Index: llvm/trunk/docs/LangRef.rst =================================================================== --- llvm/trunk/docs/LangRef.rst +++ llvm/trunk/docs/LangRef.rst @@ -11342,12 +11342,12 @@ :: - declare <16 x float> @llvm.masked.load.v16f32 (<16 x float>* , i32 , <16 x i1> , <16 x float> ) - declare <2 x double> @llvm.masked.load.v2f64 (<2 x double>* , i32 , <2 x i1> , <2 x double> ) + declare <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* , i32 , <16 x i1> , <16 x float> ) + declare <2 x double> @llvm.masked.load.v2f64.p0v2f64 (<2 x double>* , i32 , <2 x i1> , <2 x double> ) ;; The data is a vector of pointers to double - declare <8 x double*> @llvm.masked.load.v8p0f64 (<8 x double*>* , i32 , <8 x i1> , <8 x double*> ) + declare <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64 (<8 x double*>* , i32 , <8 x i1> , <8 x double*> ) ;; The data is a vector of function pointers - declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f (<8 x i32 ()*>* , i32 , <8 x i1> , <8 x i32 ()*> ) + declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f (<8 x i32 ()*>* , i32 , <8 x i1> , <8 x i32 ()*> ) Overview: """"""""" @@ -11370,7 +11370,7 @@ :: - %res = call <16 x float> @llvm.masked.load.v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru) + %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* %ptr, i32 4, <16 x i1>%mask, <16 x float> %passthru) ;; The result of the two following instructions is identical aside from potential memory access exception %loadlal = load <16 x float>, <16 x float>* %ptr, align 4 @@ -11387,12 +11387,12 @@ :: - declare void @llvm.masked.store.v8i32 (<8 x i32> , <8 x i32>* , i32 , <8 x i1> ) - declare void @llvm.masked.store.v16f32 (<16 x float> , <16 x float>* , i32 , <16 x i1> ) + declare void @llvm.masked.store.v8i32.p0v8i32 (<8 x i32> , <8 x i32>* , i32 , <8 x i1> ) + declare void @llvm.masked.store.v16f32.p0v16f32 (<16 x float> , <16 x float>* , i32 , <16 x i1> ) ;; The data is a vector of pointers to double - declare void @llvm.masked.store.v8p0f64 (<8 x double*> , <8 x double*>* , i32 , <8 x i1> ) + declare void @llvm.masked.store.v8p0f64.p0v8p0f64 (<8 x double*> , <8 x double*>* , i32 , <8 x i1> ) ;; The data is a vector of function pointers - declare void @llvm.masked.store.v4p0f_i32f (<4 x i32 ()*> , <4 x i32 ()*>* , i32 , <4 x i1> ) + declare void @llvm.masked.store.v4p0f_i32f.p0v4p0f_i32f (<4 x i32 ()*> , <4 x i32 ()*>* , i32 , <4 x i1> ) Overview: """"""""" @@ -11413,7 +11413,7 @@ :: - call void @llvm.masked.store.v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask) + call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask) ;; The result of the following instructions is identical aside from potential data races and memory access exceptions %oldval = load <16 x float>, <16 x float>* %ptr, align 4 Index: llvm/trunk/include/llvm/IR/IRBuilder.h =================================================================== --- llvm/trunk/include/llvm/IR/IRBuilder.h +++ llvm/trunk/include/llvm/IR/IRBuilder.h @@ -520,9 +520,9 @@ private: /// \brief Create a call to a masked intrinsic with given Id. - /// Masked intrinsic has only one overloaded type - data type. CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef Ops, - Type *DataTy, const Twine &Name = ""); + ArrayRef OverloadedTypes, + const Twine &Name = ""); Value *getCastedInt8PtrValue(Value *Ptr); }; Index: llvm/trunk/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/trunk/include/llvm/IR/Intrinsics.td +++ llvm/trunk/include/llvm/IR/Intrinsics.td @@ -632,13 +632,14 @@ //===-------------------------- Masked Intrinsics -------------------------===// // -def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>, +def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, + LLVMAnyPointerType>, llvm_i32_ty, LLVMVectorSameWidth<0, llvm_i1_ty>], [IntrReadWriteArgMem]>; def int_masked_load : Intrinsic<[llvm_anyvector_ty], - [LLVMPointerTo<0>, llvm_i32_ty, + [LLVMAnyPointerType>, llvm_i32_ty, LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadArgMem]>; Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -145,6 +145,31 @@ break; } + case 'm': { + if (Name.startswith("masked.load.")) { + Type *Tys[] = { F->getReturnType(), F->arg_begin()->getType() }; + if (F->getName() != Intrinsic::getName(Intrinsic::masked_load, Tys)) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::masked_load, + Tys); + return true; + } + } + if (Name.startswith("masked.store.")) { + auto Args = F->getFunctionType()->params(); + Type *Tys[] = { Args[0], Args[1] }; + if (F->getName() != Intrinsic::getName(Intrinsic::masked_store, Tys)) { + F->setName(Name + ".old"); + NewFn = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::masked_store, + Tys); + return true; + } + } + break; + } + case 'o': // We only need to change the name to match the mangling including the // address space. @@ -790,6 +815,15 @@ CI->eraseFromParent(); return; } + + case Intrinsic::masked_load: + case Intrinsic::masked_store: { + SmallVector Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args)); + CI->eraseFromParent(); + return; + } } } Index: llvm/trunk/lib/IR/IRBuilder.cpp =================================================================== --- llvm/trunk/lib/IR/IRBuilder.cpp +++ llvm/trunk/lib/IR/IRBuilder.cpp @@ -212,13 +212,15 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name) { - // DataTy is the overloaded type - Type *DataTy = cast(Ptr->getType())->getElementType(); + PointerType *PtrTy = cast(Ptr->getType()); + Type *DataTy = PtrTy->getElementType(); assert(DataTy->isVectorTy() && "Ptr should point to a vector"); if (!PassThru) PassThru = UndefValue::get(DataTy); + Type *OverloadedTypes[] = { DataTy, PtrTy }; Value *Ops[] = { Ptr, getInt32(Align), Mask, PassThru}; - return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name); + return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, + OverloadedTypes, Name); } /// \brief Create a call to a Masked Store intrinsic. @@ -229,19 +231,22 @@ /// be accessed in memory CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask) { + PointerType *PtrTy = cast(Ptr->getType()); + Type *DataTy = PtrTy->getElementType(); + assert(DataTy->isVectorTy() && "Ptr should point to a vector"); + Type *OverloadedTypes[] = { DataTy, PtrTy }; Value *Ops[] = { Val, Ptr, getInt32(Align), Mask }; - // Type of the data to be stored - the only one overloaded type - return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, Val->getType()); + return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes); } /// Create a call to a Masked intrinsic, with given intrinsic Id, -/// an array of operands - Ops, and one overloaded type - DataTy +/// an array of operands - Ops, and an array of overloaded types - +/// OverloadedTypes. CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef Ops, - Type *DataTy, + ArrayRef OverloadedTypes, const Twine &Name) { Module *M = BB->getParent()->getParent(); - Type *OverloadedTypes[] = { DataTy }; Value *TheFn = Intrinsic::getDeclaration(M, Id, OverloadedTypes); return createCallHelper(TheFn, Ops, this, Name); } @@ -270,7 +275,7 @@ // We specify only one type when we create this intrinsic. Types of other // arguments are derived from this type. - return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name); + return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name); } /// \brief Create a call to a Masked Scatter intrinsic. @@ -300,7 +305,7 @@ // We specify only one type when we create this intrinsic. Types of other // arguments are derived from this type. - return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy); + return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy }); } template Index: llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ llvm/trunk/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -7,7 +7,7 @@ ; AVX2: Found an estimated cost of 4 {{.*}}.masked define <2 x double> @test1(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { %mask = icmp eq <2 x i64> %trigger, zeroinitializer - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) ret <2 x double> %res } @@ -15,7 +15,7 @@ ; AVX2: Found an estimated cost of 4 {{.*}}.masked define <4 x i32> @test2(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) ret <4 x i32> %res } @@ -23,7 +23,7 @@ ; AVX2: Found an estimated cost of 4 {{.*}}.masked define void @test3(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) ret void } @@ -31,7 +31,7 @@ ; AVX2: Found an estimated cost of 4 {{.*}}.masked define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { %mask = icmp eq <8 x i32> %trigger, zeroinitializer - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) ret <8 x float> %res } @@ -39,7 +39,7 @@ ; AVX2: Found an estimated cost of 5 {{.*}}.masked define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -47,7 +47,7 @@ ; AVX2: Found an estimated cost of 6 {{.*}}.masked define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -55,7 +55,7 @@ ; AVX2: Found an estimated cost of 5 {{.*}}.masked define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) ret <2 x float> %res } @@ -63,7 +63,7 @@ ; AVX2: Found an estimated cost of 6 {{.*}}.masked define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) ret <2 x i32> %res } @@ -279,24 +279,22 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) -declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) -declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) -declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) -declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) -declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) -declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) -declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) -declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) -declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) -declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) -declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) -declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) -declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) -declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) -declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) - +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) Index: llvm/trunk/test/Assembler/auto_upgrade_intrinsics.ll =================================================================== --- llvm/trunk/test/Assembler/auto_upgrade_intrinsics.ll +++ llvm/trunk/test/Assembler/auto_upgrade_intrinsics.ll @@ -54,7 +54,24 @@ define i32 @test.objectsize() { ; CHECK-LABEL: @test.objectsize( ; CHECK: @llvm.objectsize.i32.p0i8 -; CHECK-DAG: declare i32 @llvm.objectsize.i32.p0i8 %s = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) ret i32 %s } + +declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) + +define <2 x double> @tests.masked.load(<2 x double>* %ptr, <2 x i1> %mask, <2 x double> %passthru) { +; CHECK-LABEL: @tests.masked.load( +; CHECK: @llvm.masked.load.v2f64.p0v2f64 + %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru) + ret <2 x double> %res +} + +declare void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) + +define void @tests.masked.store(<2 x double>* %ptr, <2 x i1> %mask, <2 x double> %val) { +; CHECK-LABEL: @tests.masked.store( +; CHECK: @llvm.masked.store.v2f64.p0v2f64 + call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> %mask) + ret void +} \ No newline at end of file Index: llvm/trunk/test/CodeGen/X86/avx512-bugfix-26264.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-bugfix-26264.ll +++ llvm/trunk/test/CodeGen/X86/avx512-bugfix-26264.ll @@ -18,7 +18,7 @@ ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 ; AVX512BW-NEXT: retq - %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res } @@ -39,9 +39,9 @@ ; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 ; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 ; AVX512BW-NEXT: retq - %res = call <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) + %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) ret <32 x i64> %res } -declare <32 x i64> @llvm.masked.load.v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) -declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) +declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0) +declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) Index: llvm/trunk/test/CodeGen/X86/masked_memop.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/masked_memop.ll +++ llvm/trunk/test/CodeGen/X86/masked_memop.ll @@ -40,7 +40,7 @@ ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) + %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) ret <16 x i32> %res } @@ -76,7 +76,7 @@ ; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) + %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) ret <16 x i32> %res } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) ret void } @@ -155,7 +155,7 @@ ; AVX512-NEXT: vmovaps %zmm1, %zmm0 ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) + %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) ret <16 x float> %res } @@ -210,7 +210,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) + %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) ret <8 x double> %res } @@ -239,7 +239,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) ret <2 x double> %res } @@ -268,7 +268,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) ret <4 x float> %res } @@ -305,7 +305,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) ret <4 x i32> %res } @@ -338,7 +338,7 @@ ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) ret void } @@ -381,7 +381,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst) ret <4 x double> %res } @@ -420,7 +420,7 @@ ; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer) ret <4 x double> %res } @@ -462,7 +462,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) ret <8 x float> %res } @@ -507,7 +507,7 @@ ; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq - %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) + %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) ret <8 x i32> %res } @@ -548,7 +548,7 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %res } @@ -589,7 +589,7 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) + %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) ret <8 x i32> %res } @@ -629,7 +629,7 @@ ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) + call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) ret void } @@ -667,7 +667,7 @@ ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) ret void } @@ -712,7 +712,7 @@ ; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -758,7 +758,7 @@ ; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) ret void } @@ -807,7 +807,7 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) ret <2 x float> %res } @@ -863,7 +863,7 @@ ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) ret <2 x i32> %res } @@ -908,7 +908,7 @@ ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer - %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) + %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) ret <2 x float> %res } @@ -931,7 +931,7 @@ ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float>undef) ret <4 x float> %res } @@ -960,7 +960,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float> %dst) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float> %dst) ret <4 x float> %res } @@ -994,7 +994,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ; SKX-NEXT: retq - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1> , <4 x i32> %dst) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> , <4 x i32> %dst) ret <4 x i32> %res } @@ -1021,7 +1021,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq - %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1> , <8 x float> %dst) + %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> , <8 x float> %dst) ret <8 x float> %res } @@ -1046,7 +1046,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %dst) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %dst) ret <4 x double> %res } @@ -1080,7 +1080,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq - %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> , <8 x i32> %dst) + %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> , <8 x i32> %dst) ret <8 x i32> %res } @@ -1112,7 +1112,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ; SKX-NEXT: retq - %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %dst) + %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %dst) ret <4 x i64> %res } @@ -1135,7 +1135,7 @@ ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1} ; AVX512-NEXT: retq - %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %dst) + %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %dst) ret <8 x double> %res } @@ -1160,7 +1160,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> undef) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> undef) ret <4 x double> %res } @@ -1189,7 +1189,7 @@ ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> undef) + %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> undef) ret <4 x i64> %res } @@ -1218,7 +1218,7 @@ ; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer - call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } @@ -1234,7 +1234,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vmovd %xmm0, (%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void } @@ -1250,7 +1250,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1>) ret void } @@ -1269,7 +1269,7 @@ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 16(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) ret void } @@ -1288,7 +1288,7 @@ ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) + call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) ret void } @@ -1307,7 +1307,7 @@ ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi) ; AVX512-NEXT: retq - call void @llvm.masked.store.v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) + call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void } @@ -1323,7 +1323,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: retq - %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val) + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>, <4 x i32> %val) ret <4 x i32> %res } @@ -1339,7 +1339,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX512-NEXT: retq - %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val) + %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>, <4 x float> %val) ret <4 x float> %res } @@ -1373,7 +1373,7 @@ ; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq - %res = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val) + %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val) ret <4 x i64> %res } @@ -1400,7 +1400,7 @@ ; SKX-NEXT: vmovhpd 24(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0] ; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq - %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val) + %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val) ret <4 x double> %res } @@ -1421,37 +1421,36 @@ ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq - %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) + %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) ret <8 x double> %res } -declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) -declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) -declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) -declare <4 x i64> @llvm.masked.load.v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) -declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) -declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) -declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) -declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) -declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) -declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) -declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) -declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) -declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) -declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) -declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) -declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) -declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) -declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) +declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>) +declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) -declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>) +declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>) define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) { ; AVX1-LABEL: test23: @@ -1501,13 +1500,13 @@ ; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ; AVX512-NEXT: retq %mask = icmp eq <16 x i32*> %trigger, zeroinitializer - %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer) + %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer) ret <16 x i32*> %res } %mystruct = type { i16, i16, [1 x i8*] } -declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) +declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>) define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) { ; AVX1-LABEL: test24: @@ -1596,7 +1595,7 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: retq - %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) + %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) ret <16 x %mystruct*> %res } @@ -1687,10 +1686,10 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { ; AVX1-LABEL: test_store_16f64: @@ -1779,10 +1778,10 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { ; AVX1-LABEL: test_load_16i64: @@ -1883,10 +1882,10 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: vmovaps %zmm2, %zmm1 ; SKX-NEXT: retq - %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) + %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res } -declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) +declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { ; AVX1-LABEL: test_load_16f64: @@ -1987,10 +1986,10 @@ ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: vmovaps %zmm2, %zmm1 ; SKX-NEXT: retq - %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) + %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res } -declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) +declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) { ; AVX1-LABEL: test_load_32f64: @@ -2218,10 +2217,11 @@ ; SKX-NEXT: vmovaps %zmm3, %zmm2 ; SKX-NEXT: vmovaps %zmm4, %zmm3 ; SKX-NEXT: retq - %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res } -declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) + +declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0) define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) { ; SKX-LABEL: test_mask_load_16xi8: @@ -2230,10 +2230,10 @@ ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef) + %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef) ret <16 x i8> %res } -declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) { ; SKX-LABEL: test_mask_load_32xi8: @@ -2242,10 +2242,10 @@ ; SKX-NEXT: vpmovb2m %ymm0, %k1 ; SKX-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer) + %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer) ret <32 x i8> %res } -declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>) define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) { ; SKX-LABEL: test_mask_load_64xi8: @@ -2255,10 +2255,10 @@ ; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq - %res = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val) + %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val) ret <64 x i8> %res } -declare <64 x i8> @llvm.masked.load.v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>) +declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>) define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) { ; SKX-LABEL: test_mask_load_8xi16: @@ -2267,10 +2267,10 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef) + %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef) ret <8 x i16> %res } -declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) { ; SKX-LABEL: test_mask_load_16xi16: @@ -2279,10 +2279,10 @@ ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; SKX-NEXT: retq - %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer) + %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer) ret <16 x i16> %res } -declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>) define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) { ; SKX-LABEL: test_mask_load_32xi16: @@ -2292,10 +2292,10 @@ ; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; SKX-NEXT: retq - %res = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val) + %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val) ret <32 x i16> %res } -declare <32 x i16> @llvm.masked.load.v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>) +declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>) define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) { ; SKX-LABEL: test_mask_store_16xi8: @@ -2304,10 +2304,10 @@ ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask) ret void } -declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) { ; SKX-LABEL: test_mask_store_32xi8: @@ -2316,10 +2316,10 @@ ; SKX-NEXT: vpmovb2m %ymm0, %k1 ; SKX-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask) + call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask) ret void } -declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>) +declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>) define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) { ; SKX-LABEL: test_mask_store_64xi8: @@ -2328,10 +2328,10 @@ ; SKX-NEXT: vpmovb2m %zmm0, %k1 ; SKX-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask) + call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask) ret void } -declare void @llvm.masked.store.v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>) +declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>) define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) { ; SKX-LABEL: test_mask_store_8xi16: @@ -2340,10 +2340,10 @@ ; SKX-NEXT: vpmovw2m %xmm0, %k1 ; SKX-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask) ret void } -declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) { ; SKX-LABEL: test_mask_store_16xi16: @@ -2352,10 +2352,10 @@ ; SKX-NEXT: vpmovb2m %xmm0, %k1 ; SKX-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask) + call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask) ret void } -declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>) +declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>) define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) { ; SKX-LABEL: test_mask_store_32xi16: @@ -2364,7 +2364,8 @@ ; SKX-NEXT: vpmovb2m %ymm0, %k1 ; SKX-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1} ; SKX-NEXT: retq - call void @llvm.masked.store.v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask) + call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask) ret void } -declare void @llvm.masked.store.v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>) + +declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>) Index: llvm/trunk/test/Transforms/InstCombine/masked_intrinsics.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/masked_intrinsics.ll +++ llvm/trunk/test/Transforms/InstCombine/masked_intrinsics.ll @@ -1,12 +1,12 @@ ; RUN: opt -instcombine -S < %s | FileCheck %s -declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) -declare void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) +declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) +declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru) declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask) define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru) { - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru) ret <2 x double> %res ; CHECK-LABEL: @load_zeromask( @@ -14,7 +14,7 @@ } define <2 x double> @load_onemask(<2 x double>* %ptr, <2 x double> %passthru) { - %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 2, <2 x i1> , <2 x double> %passthru) ret <2 x double> %res ; CHECK-LABEL: @load_onemask( @@ -23,7 +23,7 @@ } define void @store_zeromask(<2 x double>* %ptr, <2 x double> %val) { - call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer) + call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 3, <2 x i1> zeroinitializer) ret void ; CHECK-LABEL: @store_zeromask( @@ -31,7 +31,7 @@ } define void @store_onemask(<2 x double>* %ptr, <2 x double> %val) { - call void @llvm.masked.store.v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> ) + call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptr, i32 4, <2 x i1> ) ret void ; CHECK-LABEL: @store_onemask( Index: llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll @@ -53,7 +53,7 @@ ; CHECK-LABEL: @mload_one_one( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>* -; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %castvec, i32 1, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> , <4 x float> undef) ; CHECK-NEXT: ret <4 x float> %1 } @@ -65,7 +65,7 @@ ; CHECK-LABEL: @mload_one_one_double( ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x double>* -; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %castvec, i32 1, <2 x i1> , <2 x double> undef) +; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> , <2 x double> undef) ; CHECK-NEXT: ret <2 x double> %1 } @@ -77,7 +77,7 @@ ; CHECK-LABEL: @mload_v8f32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x float>* -; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %castvec, i32 1, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> , <8 x float> undef) ; CHECK-NEXT: ret <8 x float> %1 } @@ -87,7 +87,7 @@ ; CHECK-LABEL: @mload_v4f64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x double>* -; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %castvec, i32 1, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> , <4 x double> undef) ; CHECK-NEXT: ret <4 x double> %1 } @@ -99,7 +99,7 @@ ; CHECK-LABEL: @mload_v4i32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i32>* -; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> , <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> %1 } @@ -109,7 +109,7 @@ ; CHECK-LABEL: @mload_v2i64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x i64>* -; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> , <2 x i64> undef) +; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> , <2 x i64> undef) ; CHECK-NEXT: ret <2 x i64> %1 } @@ -119,7 +119,7 @@ ; CHECK-LABEL: @mload_v8i32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x i32>* -; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> , <8 x i32> undef) +; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> , <8 x i32> undef) ; CHECK-NEXT: ret <8 x i32> %1 } @@ -129,7 +129,7 @@ ; CHECK-LABEL: @mload_v4i64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i64>* -; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> , <4 x i64> undef) +; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> , <4 x i64> undef) ; CHECK-NEXT: ret <4 x i64> %1 } @@ -187,7 +187,7 @@ ; CHECK-LABEL: @mstore_one_one( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>* -; CHECK-NEXT: call void @llvm.masked.store.v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %v, <4 x float>* %castvec, i32 1, <4 x i1> ) ; CHECK-NEXT: ret void } @@ -199,7 +199,7 @@ ; CHECK-LABEL: @mstore_one_one_double( ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x double>* -; CHECK-NEXT: call void @llvm.masked.store.v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %v, <2 x double>* %castvec, i32 1, <2 x i1> ) ; CHECK-NEXT: ret void } @@ -211,7 +211,7 @@ ; CHECK-LABEL: @mstore_v8f32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x float>* -; CHECK-NEXT: call void @llvm.masked.store.v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %v, <8 x float>* %castvec, i32 1, <8 x i1> ) ; CHECK-NEXT: ret void } @@ -221,7 +221,7 @@ ; CHECK-LABEL: @mstore_v4f64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x double>* -; CHECK-NEXT: call void @llvm.masked.store.v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %v, <4 x double>* %castvec, i32 1, <4 x i1> ) ; CHECK-NEXT: ret void } @@ -233,7 +233,7 @@ ; CHECK-LABEL: @mstore_v4i32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v, <4 x i32>* %castvec, i32 1, <4 x i1> ) ; CHECK-NEXT: ret void } @@ -243,7 +243,7 @@ ; CHECK-LABEL: @mstore_v2i64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x i64>* -; CHECK-NEXT: call void @llvm.masked.store.v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %v, <2 x i64>* %castvec, i32 1, <2 x i1> ) ; CHECK-NEXT: ret void } @@ -253,7 +253,7 @@ ; CHECK-LABEL: @mstore_v8i32( ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x i32>* -; CHECK-NEXT: call void @llvm.masked.store.v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %v, <8 x i32>* %castvec, i32 1, <8 x i1> ) ; CHECK-NEXT: ret void } @@ -263,7 +263,7 @@ ; CHECK-LABEL: @mstore_v4i64( ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i64>* -; CHECK-NEXT: call void @llvm.masked.store.v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> ) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %v, <4 x i64>* %castvec, i32 1, <4 x i1> ) ; CHECK-NEXT: ret void } Index: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -18,16 +18,16 @@ ;AVX-LABEL: @foo1 ;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32 +;AVX: call <8 x i32> @llvm.masked.load.v8i32.p0v8i32 ;AVX: add nsw <8 x i32> -;AVX: call void @llvm.masked.store.v8i32 +;AVX: call void @llvm.masked.store.v8i32.p0v8i32 ;AVX: ret void ;AVX512-LABEL: @foo1 ;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32 +;AVX512: call <16 x i32> @llvm.masked.load.v16i32.p0v16i32 ;AVX512: add nsw <16 x i32> -;AVX512: call void @llvm.masked.store.v16i32 +;AVX512: call void @llvm.masked.store.v16i32.p0v16i32 ;AVX512: ret void ; Function Attrs: nounwind uwtable @@ -89,6 +89,81 @@ ret void } +; The same as @foo1 but all the pointers are address space 1 pointers. + +;AVX-LABEL: @foo1_addrspace1 +;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32.p1v8i32 +;AVX: add nsw <8 x i32> +;AVX: call void @llvm.masked.store.v8i32.p1v8i32 +;AVX: ret void + +;AVX512-LABEL: @foo1_addrspace1 +;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32.p1v16i32 +;AVX512: add nsw <16 x i32> +;AVX512: call void @llvm.masked.store.v16i32.p1v16i32 +;AVX512: ret void + +; Function Attrs: nounwind uwtable +define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) { +entry: + %A.addr = alloca i32 addrspace(1)*, align 8 + %B.addr = alloca i32 addrspace(1)*, align 8 + %trigger.addr = alloca i32 addrspace(1)*, align 8 + %i = alloca i32, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8 + store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8 + store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2 + %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4 + %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4 + %add = add nsw i32 %6, %9 + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6 + store i32 %add, i32 addrspace(1)* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + ; The source code: ; ;void foo2(float *A, float *B, int *trigger) { @@ -102,16 +177,16 @@ ;AVX-LABEL: @foo2 ;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f32 +;AVX: call <8 x float> @llvm.masked.load.v8f32.p0v8f32 ;AVX: fadd <8 x float> -;AVX: call void @llvm.masked.store.v8f32 +;AVX: call void @llvm.masked.store.v8f32.p0v8f32 ;AVX: ret void ;AVX512-LABEL: @foo2 ;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16f32 +;AVX512: call <16 x float> @llvm.masked.load.v16f32.p0v16f32 ;AVX512: fadd <16 x float> -;AVX512: call void @llvm.masked.store.v16f32 +;AVX512: call void @llvm.masked.store.v16f32.p0v16f32 ;AVX512: ret void ; Function Attrs: nounwind uwtable @@ -187,18 +262,18 @@ ;AVX-LABEL: @foo3 ;AVX: icmp slt <4 x i32> %wide.load, @llvm.masked.load.v4f64 +;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 ;AVX: sitofp <4 x i32> %wide.load to <4 x double> ;AVX: fadd <4 x double> -;AVX: call void @llvm.masked.store.v4f64 +;AVX: call void @llvm.masked.store.v4f64.p0v4f64 ;AVX: ret void ;AVX512-LABEL: @foo3 ;AVX512: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f64 +;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64 ;AVX512: sitofp <8 x i32> %wide.load to <8 x double> ;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void @@ -429,17 +504,17 @@ ;AVX2-LABEL: @foo6 ;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer ;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> -;AVX2: call <4 x double> @llvm.masked.load.v4f64 +;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 ;AVX2: fadd <4 x double> -;AVX2: call void @llvm.masked.store.v4f64 +;AVX2: call void @llvm.masked.store.v4f64.p0v4f64 ;AVX2: ret void ;AVX512-LABEL: @foo6 ;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer ;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> @llvm.masked.load.v8f64 +;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64 ;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void @@ -507,8 +582,8 @@ ; } ;AVX512-LABEL: @foo7 -;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64(<8 x double*>* -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { @@ -579,8 +654,8 @@ ;} ;AVX512-LABEL: @foo8 -;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f(<8 x i32 ()*>* % -;AVX512: call void @llvm.masked.store.v8f64 +;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* % +;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX512: ret void define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {