Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -11333,12 +11333,12 @@
 ::

-      declare <16 x float> @llvm.masked.load.v16f32 (<16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
-      declare <2 x double> @llvm.masked.load.v2f64 (<2 x double>* <ptr>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
+      declare <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
+      declare <2 x double> @llvm.masked.load.v2f64.p0v2f64 (<2 x double>* <ptr>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
       ;; The data is a vector of pointers to double
-      declare <8 x double*> @llvm.masked.load.v8p0f64 (<8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x double*> <passthru>)
+      declare <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64 (<8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x double*> <passthru>)
       ;; The data is a vector of function pointers
-      declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f (<8 x i32 ()*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x i32 ()*> <passthru>)
+      declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f (<8 x i32 ()*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x i32 ()*> <passthru>)

 Overview:
 """""""""
@@ -11349,7 +11349,7 @@
 Arguments:
 """"""""""
-The first operand is the base pointer for the load. The second operand is the alignment of the source location. It must be a constant integer value. The third operand, mask, is a vector of boolean values with the same number of elements as the return type. The fourth is a pass-through value that is used to fill the masked-off lanes of the result. The return type, underlying type of the base pointer and the type of the '``passthru``' operand are the same vector types.
+The first operand is the base pointer for the load. The base pointer can be in an arbitrary address space. The second operand is the alignment of the source location. It must be a constant integer value. The third operand, mask, is a vector of boolean values with the same number of elements as the return type. The fourth is a pass-through value that is used to fill the masked-off lanes of the result. The return type, underlying type of the base pointer and the type of the '``passthru``' operand are the same vector types.

 Semantics:
@@ -11361,7 +11361,7 @@
 ::

-       %res = call <16 x float> @llvm.masked.load.v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru)
+       %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru)

        ;; The result of the two following instructions is identical aside from potential memory access exception
        %loadlal = load <16 x float>, <16 x float>* %ptr, align 4
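As an illustrative sketch, not part of the patch itself: with the base pointer now allowed to live in any address space, the pointer type is mangled into a second suffix. The addrspace(1) choice, value names, and alignment below are placeholders only:

::

      ;; hypothetical masked load through an addrspace(1) pointer; the second suffix
      ;; (.p1v8i32) encodes the pointer's address space (1) and pointee type (<8 x i32>)
      declare <8 x i32> @llvm.masked.load.v8i32.p1v8i32 (<8 x i32> addrspace(1)* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x i32> <passthru>)

      %res = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32 (<8 x i32> addrspace(1)* %p, i32 4, <8 x i1> %mask, <8 x i32> %passthru)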
@@ -11378,12 +11378,12 @@
 ::

-       declare void @llvm.masked.store.v8i32 (<8 x i32> <value>, <8 x i32>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
-       declare void @llvm.masked.store.v16f32 (<16 x float> <value>, <16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>)
+       declare void @llvm.masked.store.v8i32.p0v8i32 (<8 x i32> <value>, <8 x i32>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
+       declare void @llvm.masked.store.v16f32.p0v16f32 (<16 x float> <value>, <16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>)
       ;; The data is a vector of pointers to double
-       declare void @llvm.masked.store.v8p0f64 (<8 x double*> <value>, <8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
+       declare void @llvm.masked.store.v8p0f64.p0v8p0f64 (<8 x double*> <value>, <8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
       ;; The data is a vector of function pointers
-       declare void @llvm.masked.store.v4p0f_i32f (<4 x i32 ()*> <value>, <4 x i32 ()*>* <ptr>, i32 <alignment>, <4 x i1> <mask>)
+       declare void @llvm.masked.store.v4p0f_i32f.p0v4p0f_i32f (<4 x i32 ()*> <value>, <4 x i32 ()*>* <ptr>, i32 <alignment>, <4 x i1> <mask>)

 Overview:
 """""""""
@@ -11393,7 +11393,7 @@
 Arguments:
 """"""""""
-The first operand is the vector value to be written to memory. The second operand is the base pointer for the store, it has the same underlying type as the value operand. The third operand is the alignment of the destination location. The fourth operand, mask, is a vector of boolean values. The types of the mask and the value operand must have the same number of vector elements.
+The first operand is the vector value to be written to memory. The second operand is the base pointer for the store; it has the same underlying type as the value operand. The base pointer can be in an arbitrary address space. The third operand is the alignment of the destination location. The fourth operand, mask, is a vector of boolean values. The types of the mask and the value operand must have the same number of vector elements.

 Semantics:
@@ -11404,7 +11404,7 @@
 ::

-       call void @llvm.masked.store.v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask)
+       call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask)

        ;; The result of the following instructions is identical aside from potential data races and memory access exceptions
        %oldval = load <16 x float>, <16 x float>* %ptr, align 4
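A matching sketch for the store side, again illustrative rather than taken from the patch; the address space, value names, and alignment are placeholders:

::

      ;; hypothetical masked store through an addrspace(1) pointer
      declare void @llvm.masked.store.v8i32.p1v8i32 (<8 x i32> <value>, <8 x i32> addrspace(1)* <ptr>, i32 <alignment>, <8 x i1> <mask>)

      call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> %value, <8 x i32> addrspace(1)* %p, i32 4, <8 x i1> %mask)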
Index: include/llvm/IR/IRBuilder.h
===================================================================
--- include/llvm/IR/IRBuilder.h
+++ include/llvm/IR/IRBuilder.h
@@ -512,9 +512,9 @@
 private:
   /// \brief Create a call to a masked intrinsic with given Id.
-  /// Masked intrinsic has only one overloaded type - data type.
+  /// Masked intrinsic has two overloaded types - data type and pointer type.
   CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
-                                  Type *DataTy, const Twine &Name = "");
+                                  PointerType *PtrTy, const Twine &Name = "");

   Value *getCastedInt8PtrValue(Value *Ptr);
 };
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -632,13 +632,14 @@
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
-def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
+def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
+                                      LLVMAnyPointerType<LLVMMatchType<0>>,
                                       llvm_i32_ty,
                                       LLVMVectorSameWidth<0, llvm_i1_ty>],
                                  [IntrReadWriteArgMem]>;

 def int_masked_load : Intrinsic<[llvm_anyvector_ty],
-                                [LLVMPointerTo<0>, llvm_i32_ty,
+                                [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
                                  LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
                                 [IntrReadArgMem]>;
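Since both the data type (llvm_anyvector_ty) and the pointer type (LLVMAnyPointerType) are now overloaded, every concrete declaration of these intrinsics carries two type suffixes, the data vector type first and the pointer type second. A sketch of the declarations this yields for an arbitrarily chosen <4 x float> data type in address space 3; the concrete types are illustrative only, not taken from the patch:

      ;; overloaded types in order of appearance: data vector type, then pointer type
      declare <4 x float> @llvm.masked.load.v4f32.p3v4f32(<4 x float> addrspace(3)*, i32, <4 x i1>, <4 x float>)
      declare void @llvm.masked.store.v4f32.p3v4f32(<4 x float>, <4 x float> addrspace(3)*, i32, <4 x i1>)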
Index: lib/IR/IRBuilder.cpp
===================================================================
--- lib/IR/IRBuilder.cpp
+++ lib/IR/IRBuilder.cpp
@@ -213,13 +213,11 @@
                                           Value *Mask, Value *PassThru,
                                           const Twine &Name) {
   assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type");
-  // DataTy is the overloaded type
-  Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
-  assert(DataTy->isVectorTy() && "Ptr should point to a vector");
+  PointerType *PtrTy = cast<PointerType>(Ptr->getType());
   if (!PassThru)
-    PassThru = UndefValue::get(DataTy);
+    PassThru = UndefValue::get(PtrTy->getElementType());
   Value *Ops[] = { Ptr, getInt32(Align), Mask, PassThru};
-  return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
+  return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, PtrTy, Name);
 }

 /// Create a call to a Masked Store intrinsic.
@@ -231,18 +229,21 @@
 CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
                                            unsigned Align, Value *Mask) {
   Value *Ops[] = { Val, Ptr, getInt32(Align), Mask };
-  // Type of the data to be stored - the only one overloaded type
-  return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, Val->getType());
+  return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops,
+                               cast<PointerType>(Ptr->getType()));
 }

 /// Create a call to a Masked intrinsic, with given intrinsic Id,
-/// an array of operands - Ops, and one overloaded type - DataTy
+/// an array of operands - Ops, and an overloaded pointer type - PtrTy.
+/// Another overloaded type - the data type - is derived from PtrTy.
 CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
                                                ArrayRef<Value *> Ops,
-                                               Type *DataTy,
+                                               PointerType *PtrTy,
                                                const Twine &Name) {
   Module *M = BB->getParent()->getParent();
-  Type *OverloadedTypes[] = { DataTy };
+  Type *DataTy = PtrTy->getElementType();
+  assert(DataTy->isVectorTy() && "Ptr should point to a vector");
+  Type *OverloadedTypes[] = { DataTy, PtrTy };
   Value *TheFn = Intrinsic::getDeclaration(M, Id, OverloadedTypes);
   return createCallHelper(TheFn, Ops, this, Name);
 }
Index: test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
===================================================================
--- test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -7,7 +7,7 @@
 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
 define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
-  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
   ret <2 x double> %res
 }
@@ -15,7 +15,7 @@
 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
 define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
   ret <4 x i32> %res
 }
@@ -23,7 +23,7 @@
 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
 define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
   ret void
 }
@@ -31,7 +31,7 @@
 ; AVX2: Found an estimated cost of 4 {{.*}}.masked
 define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
-  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+  %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
   ret <8 x float> %res
 }
@@ -39,7 +39,7 @@
 ; AVX2: Found an estimated cost of 5 {{.*}}.masked
 define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+  call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
   ret void
 }
@@ -47,7 +47,7 @@
 ; AVX2: Found an estimated cost of 6 {{.*}}.masked
 define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+  call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
   ret void
 }
@@ -55,7 +55,7 @@
 ; AVX2: Found an estimated cost of 5 {{.*}}.masked
 define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
-  %res = call <2 x float>
@llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) ret <2 x float> %res } @@ -63,7 +63,7 @@ ; AVX2: Found an estimated cost of 6 {{.*}}.masked define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) ret <2 x i32> %res } @@ -279,24 +279,24 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) -declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) -declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) -declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) -declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) -declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) -declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) -declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) -declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) -declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) -declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) -declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) -declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) -declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) -declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) -declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare void @llvm.masked.store.v16f32p.p0v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x 
i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) Index: test/CodeGen/X86/avx512-bugfix-26264.ll =================================================================== --- test/CodeGen/X86/avx512-bugfix-26264.ll +++ test/CodeGen/X86/avx512-bugfix-26264.ll @@ -18,7 +18,7 @@ ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 ; AVX512BW-NEXT: retq - %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res } @@ -39,9 +39,9 @@ ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 ; AVX512BW-NEXT: retq - %res = call <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) + %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) ret <32 x i64> %res } -declare <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) -declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) +declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) +declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -40,7 +40,7 @@ ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) + %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) ret <16 x i32> %res } @@ -76,7 +76,7 @@ ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) + %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) ret <16 x i32> %res } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16i32.p0v16i32(<16 x 
i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) ret void } @@ -155,7 +155,7 @@ ; AVX512-NEXT: vmovaps %zmm1, %zmm0 ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) + %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) ret <16 x float> %res } @@ -210,7 +210,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) + %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) ret <8 x double> %res } @@ -239,7 +239,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) ret <2 x double> %res } @@ -268,7 +268,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) ret <4 x float> %res } @@ -305,7 +305,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) ret <4 x i32> %res } @@ -338,7 +338,7 @@ ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) ret void } @@ -381,7 +381,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst) ret <4 x double> %res } @@ -420,7 +420,7 @@ ; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) ret <4 x double> %res } @@ -462,7 +462,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) ret <8 x float> %res } @@ -507,7 +507,7 @@ ; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq - 
%res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) + %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) ret <8 x i32> %res } @@ -548,7 +548,7 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %res } @@ -589,7 +589,7 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) + %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) ret <8 x i32> %res } @@ -629,7 +629,7 @@ ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) + call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) ret void } @@ -667,7 +667,7 @@ ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) ret void } @@ -712,7 +712,7 @@ ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -758,7 +758,7 @@ ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -807,7 +807,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) ret <2 x float> %res } @@ -863,7 +863,7 @@ ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) ret <2 x i32> %res } @@ -908,7 +908,7 @@ ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) ret <2 x float> %res } @@ -931,7 +931,7 @@ ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: 
retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef) ret <4 x float> %res } @@ -958,7 +958,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1>, <4 x float> %src0) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 16, <4 x i1>, <4 x float> %src0) ret <4 x float> %res } @@ -987,7 +987,7 @@ ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } @@ -1003,7 +1003,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vmovd %xmm0, (%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } @@ -1019,7 +1019,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) ret void } @@ -1038,7 +1038,7 @@ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 16(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) ret void } @@ -1057,7 +1057,7 @@ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) ret void } @@ -1076,7 +1076,7 @@ ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) + call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void } @@ -1092,7 +1092,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: retq - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val) ret <4 x i32> %res } @@ -1108,7 +1108,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX512-NEXT: retq - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val) ret <4 x float> %res } @@ -1142,7 +1142,7 @@ ; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq - %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val) + %res = call <4 x 
i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val) ret <4 x i64> %res } @@ -1169,7 +1169,7 @@ ; SKX-NEXT: vmovhpd 24(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0] ; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val) ret <4 x double> %res } @@ -1190,37 +1190,37 @@ ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq - %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) + %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) ret <8 x double> %res } -declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) -declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) -declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) -declare <4 x i64> @llvm.masked.load.v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) -declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) -declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) -declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) -declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) -declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) -declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) -declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) -declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) -declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) -declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) -declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) -declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) -declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void 
@llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare void @llvm.masked.store.v16f32p.p0v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) -declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>) +declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>) define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) { ; AVX1-LABEL: test23: @@ -1270,13 +1270,13 @@ ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32*> %trigger, zeroinitializer - %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer) + %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer) ret <16 x i32*> %res } %mystruct = type { i16, i16, [1 x i8*] } -declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) +declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) { ; AVX1-LABEL: test24: @@ -1365,7 +1365,7 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: retq - %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) + %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) ret <16 x %mystruct*> %res } @@ -1456,10 +1456,10 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) + call void 
@llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { ; AVX1-LABEL: test_store_16f64: @@ -1548,10 +1548,10 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { ; AVX1-LABEL: test_load_16i64: @@ -1652,10 +1652,10 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: vmovaps %zmm2, %zmm1 ; SKX-NEXT: retq - %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) + %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res } -declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) +declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { ; AVX1-LABEL: test_load_16f64: @@ -1756,10 +1756,10 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: vmovaps %zmm2, %zmm1 ; SKX-NEXT: retq - %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) + %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res } -declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) +declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) { ; AVX1-LABEL: test_load_32f64: @@ -1987,7 +1987,7 @@ ; SKX-NEXT: vmovaps %zmm3, %zmm2 ; SKX-NEXT: vmovaps %zmm4, %zmm3 ; SKX-NEXT: retq - %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res } -declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) +declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) Index: test/Transforms/InstCombine/masked_intrinsics.ll =================================================================== --- test/Transforms/InstCombine/masked_intrinsics.ll +++ test/Transforms/InstCombine/masked_intrinsics.ll @@ -1,12 +1,12 @@ ; RUN: opt -instcombine -S < %s | FileCheck %s -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* 
%ptrs, i32, <2 x i1> %mask, <2 x double> %src0) -declare void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru) declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask) define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru) { - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru) ret <2 x double> %res ; CHECK-LABEL: @load_zeromask( @@ -14,7 +14,7 @@ } define <2 x double> @load_onemask(<2 x double>* %ptr, <2 x double> %passthru) { - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) ret <2 x double> %res ; CHECK-LABEL: @load_onemask( @@ -23,7 +23,7 @@ } define void @store_zeromask(<2 x double>* %ptr, <2 x double> %val) { - call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer) + call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer) ret void ; CHECK-LABEL: @store_zeromask( @@ -31,7 +31,7 @@ } define void @store_onemask(<2 x double>* %ptr, <2 x double> %val) { - call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> ) + call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> ) ret void ; CHECK-LABEL: @store_onemask( Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -18,16 +18,16 @@ ;AVX-LABEL: @foo1 ;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32 +;AVX: call <8 x i32> @llvm.masked.load.v8i32.p0v8i32 ;AVX: add nsw <8 x i32> -;AVX: call void @llvm.masked.store.v8i32 +;AVX: call void @llvm.masked.store.v8i32.p0v8i32 ;AVX: ret void ;AVX512-LABEL: @foo1 ;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32 +;AVX512: call <16 x i32> @llvm.masked.load.v16i32.p0v16i32 ;AVX512: add nsw <16 x i32> -;AVX512: call void @llvm.masked.store.v16i32 +;AVX512: call void @llvm.masked.store.v16i32.p0v16i32 ;AVX512: ret void ; Function Attrs: nounwind uwtable @@ -89,6 +89,81 @@ ret void } +; The same as @foo1 but all the pointers are address space 1 pointers. 
+ +;AVX-LABEL: @foo1_addrspace1 +;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32.p1v8i32 +;AVX: add nsw <8 x i32> +;AVX: call void @llvm.masked.store.v8i32.p1v8i32 +;AVX: ret void + +;AVX512-LABEL: @foo1_addrspace1 +;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32.p1v16i32 +;AVX512: add nsw <16 x i32> +;AVX512: call void @llvm.masked.store.v16i32.p1v16i32 +;AVX512: ret void + +; Function Attrs: nounwind uwtable +define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) { +entry: + %A.addr = alloca i32 addrspace(1)*, align 8 + %B.addr = alloca i32 addrspace(1)*, align 8 + %trigger.addr = alloca i32 addrspace(1)*, align 8 + %i = alloca i32, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8 + store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8 + store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2 + %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4 + %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4 + %add = add nsw i32 %6, %9 + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6 + store i32 %add, i32 addrspace(1)* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + ; The source code: ; ;void foo2(float *A, float *B, int *trigger) { @@ -102,16 +177,16 @@ ;AVX-LABEL: @foo2 ;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f32 +;AVX: call <8 x float> @llvm.masked.load.v8f32.p0v8f32 ;AVX: fadd <8 x float> -;AVX: call void @llvm.masked.store.v8f32 +;AVX: call void @llvm.masked.store.v8f32.p0v8f32 ;AVX: ret void ;AVX512-LABEL: @foo2 ;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16f32 +;AVX512: call <16 x float> @llvm.masked.load.v16f32.p0v16f32 ;AVX512: fadd <16 x float> -;AVX512: call void @llvm.masked.store.v16f32 +;AVX512: call void @llvm.masked.store.v16f32.p0v16f32 ;AVX512: ret void ; Function Attrs: nounwind uwtable @@ -187,18 +262,18 @@ ;AVX-LABEL: @foo3 ;AVX: icmp slt <4 x i32> %wide.load, @llvm.masked.load.v4f64 +;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 ;AVX: sitofp <4 x i32> %wide.load to 
<4 x double> ;AVX: fadd <4 x double> -;AVX: call void @llvm.masked.store.v4f64 +;AVX: call void @llvm.masked.store.v4f64.p0v4f64 ;AVX: ret void ;AVX512-LABEL: @foo3 ;AVX512: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f64 +;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64 ;AVX512: sitofp <8 x i32> %wide.load to <8 x double> ;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void @@ -428,17 +503,17 @@ ;AVX2-LABEL: @foo6 ;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer ;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> -;AVX2: call <4 x double> @llvm.masked.load.v4f64 +;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 ;AVX2: fadd <4 x double> -;AVX2: call void @llvm.masked.store.v4f64 +;AVX2: call void @llvm.masked.store.v4f64.p0v4f64 ;AVX2: ret void ;AVX512-LABEL: @foo6 ;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer ;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> @llvm.masked.load.v8f64 +;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64 ;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void @@ -506,8 +581,8 @@ ; } ;AVX512-LABEL: @foo7 -;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64(<8 x double*>* -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { @@ -578,8 +653,8 @@ ;} ;AVX512-LABEL: @foo8 -;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f(<8 x i32 ()*>* % -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* % +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
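To make the new @foo1_addrspace1 FileCheck patterns concrete, here is a rough sketch of the kind of vector loop body the loop vectorizer is expected to form for the AVX2 configuration; this is not actual compiler output, and the value names and the <i32 100, ...> splat are placeholders:

      ;; illustrative only: masked load/store through addrspace(1) pointers,
      ;; as the AVX checks for @foo1_addrspace1 anticipate
      %cmp = icmp slt <8 x i32> %wide.load, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
      %b = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* %pB, i32 4, <8 x i1> %cmp, <8 x i32> undef)
      %sum = add nsw <8 x i32> %b, %wide.load
      call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> %sum, <8 x i32> addrspace(1)* %pA, i32 4, <8 x i1> %cmp)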