Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -7904,7 +7904,7 @@
        ; get pointers for 8 elements from array B
        %ptrs = getelementptr double, double* %B, <8 x i32> %C
        ; load 8 elements from array B into A
-       %A = call <8 x double> @llvm.masked.gather.v8f64(<8 x double*> %ptrs,
+       %A = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs,
             i32 8, <8 x i1> %mask, <8 x double> %passthru)
 
 Conversion Operations
@@ -12013,9 +12013,9 @@
 
 ::
 
-      declare <16 x float> @llvm.masked.gather.v16f32 (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
-      declare <2 x double> @llvm.masked.gather.v2f64 (<2 x double*> <ptrs>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
-      declare <8 x float*> @llvm.masked.gather.v8p0f32 (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1> <mask>, <8 x float*> <passthru>)
+      declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32 (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
+      declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64 (<2 x double*> <ptrs>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
+      declare <8 x float*> @llvm.masked.gather.v8p0f32.v8p0p0f32 (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1> <mask>, <8 x float*> <passthru>)
 
 Overview:
 """""""""
@@ -12038,7 +12038,7 @@
 
 ::
 
-       %res = call <4 x double> @llvm.masked.gather.v4f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+       %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
 
        ;; The gather with all-true mask is equivalent to the following instruction sequence
        %ptr0 = extractelement <4 x double*> %ptrs, i32 0
@@ -12067,9 +12067,9 @@
 
 ::
 
-       declare void @llvm.masked.scatter.v8i32 (<8 x i32> <value>, <8 x i32*> <ptrs>, i32 <alignment>, <8 x i1> <mask>)
-       declare void @llvm.masked.scatter.v16f32 (<16 x float> <value>, <16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>)
-       declare void @llvm.masked.scatter.v4p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1> <mask>)
+       declare void @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> <value>, <8 x i32*> <ptrs>, i32 <alignment>, <8 x i1> <mask>)
+       declare void @llvm.masked.scatter.v16f32.v16p0f32 (<16 x float> <value>, <16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>)
+       declare void @llvm.masked.scatter.v4p0f64.v4p0p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1> <mask>)
 
 Overview:
 """""""""
@@ -12090,7 +12090,7 @@
 ::
 
        ;; This instruction unconditionally stores data vector in multiple addresses
-       call @llvm.masked.scatter.v8i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+       call @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 
        ;; It is equivalent to a list of scalar stores
        %val0 = extractelement <8 x i32> %value, i32 0
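The second overloaded type in these mangled names is the type of the
vector-of-pointers operand, which makes the pointers' address space visible in
the intrinsic name. A sketch (illustrative, not part of the patch) of the same
gather through pointers in address space 1:

::

      declare <16 x float> @llvm.masked.gather.v16f32.v16p1f32 (<16 x float addrspace(1)*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)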
Index: include/llvm/IR/Intrinsics.h
===================================================================
--- include/llvm/IR/Intrinsics.h
+++ include/llvm/IR/Intrinsics.h
@@ -100,7 +100,7 @@
       Void, VarArg, MMX, Token, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
       Argument, ExtendArgument, TruncArgument, HalfVecArgument,
-      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfPtrsToElt
+      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt
     } Kind;
 
     union {
@@ -119,21 +119,32 @@
       AK_AnyVector, AK_AnyPointer
     };
+
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
              Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == PtrToElt || Kind == VecOfPtrsToElt);
+             Kind == PtrToElt);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == VecOfPtrsToElt);
+             Kind == SameVecWidthArgument || Kind == PtrToArgument);
       return (ArgKind)(Argument_Info & 7);
     }
+
+    // VecOfAnyPtrsToElt uses both an overloaded argument (for the address
+    // space) and a reference argument (for matching the vector width and
+    // element types).
+    unsigned getOverloadArgNumber() const {
+      assert(Kind == VecOfAnyPtrsToElt);
+      return Argument_Info >> 16;
+    }
+    unsigned getRefArgNumber() const {
+      assert(Kind == VecOfAnyPtrsToElt);
+      return Argument_Info & 0xFFFF;
+    }
+
     static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
       IITDescriptor Result = { K, { Field } };
       return Result;
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -143,7 +143,7 @@
 }
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
 class LLVMPointerToElt<int num> : LLVMMatchType<num>;
-class LLVMVectorOfPointersToElt<int num> : LLVMMatchType<num>;
+class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
 
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type, but change the element count to be half as many
@@ -746,14 +746,14 @@
                                  [IntrReadMem, IntrArgMemOnly]>;
 
 def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
-                                 [LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                 [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                   LLVMVectorSameWidth<0, llvm_i1_ty>,
                                   LLVMMatchType<0>],
                                  [IntrReadMem]>;
 
 def int_masked_scatter: Intrinsic<[],
                                   [llvm_anyvector_ty,
-                                   LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                   LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                    LLVMVectorSameWidth<0, llvm_i1_ty>]>;
 
 def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -467,6 +467,27 @@
       return true;
     }
   }
+  // Rename the gather/scatter intrinsics that lack address-space overloading
+  // to the new overloads, which include the address space in the name.
+  if (Name.startswith("masked.gather.")) {
+    Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()};
+    if (F->getName() != Intrinsic::getName(Intrinsic::masked_gather, Tys)) {
+      rename(F);
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::masked_gather, Tys);
+      return true;
+    }
+  }
+  if (Name.startswith("masked.scatter.")) {
+    auto Args = F->getFunctionType()->params();
+    Type *Tys[] = {Args[0], Args[1]};
+    if (F->getName() != Intrinsic::getName(Intrinsic::masked_scatter, Tys)) {
+      rename(F);
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::masked_scatter, Tys);
+      return true;
+    }
+  }
   break;
 }
 case 'n': {
@@ -2072,7 +2093,9 @@
   case Intrinsic::invariant_start:
   case Intrinsic::invariant_end:
   case Intrinsic::masked_load:
-  case Intrinsic::masked_store: {
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter: {
     SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
                                  CI->arg_operands().end());
     NewCall = Builder.CreateCall(NewFn, Args);
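A sketch of the upgrade in action (it mirrors the
test/Assembler/auto_upgrade_intrinsics.ll changes later in this patch):
bitcode that declares and calls the old, singly-mangled overload

      declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
      %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru)

is rewritten on load to the doubly-mangled name that also encodes the pointer
type:

      declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
      %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru)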
Index: lib/IR/Function.cpp
===================================================================
--- lib/IR/Function.cpp
+++ lib/IR/Function.cpp
@@ -575,7 +575,7 @@
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
   IIT_PTR_TO_ELT = 33,
-  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_VEC_OF_ANYPTRS_TO_ELT = 34,
   IIT_I128 = 35,
   IIT_V512 = 36,
   IIT_V1024 = 37
@@ -717,10 +717,11 @@
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
     return;
   }
-  case IIT_VEC_OF_PTRS_TO_ELT: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
-                                             ArgInfo));
+  case IIT_VEC_OF_ANYPTRS_TO_ELT: {
+    unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfAnyPtrsToElt,
+                                             (ArgNo << 16 | RefNo)));
     return;
   }
   case IIT_EMPTYSTRUCT:
@@ -809,7 +810,6 @@
       Elts[i] = DecodeFixedType(Infos, Tys, Context);
     return StructType::get(Context, makeArrayRef(Elts,D.Struct_NumElements));
   }
-
   case IITDescriptor::Argument:
     return Tys[D.getArgumentNumber()];
   case IITDescriptor::ExtendArgument: {
@@ -851,15 +851,9 @@
     Type *EltTy = VTy->getVectorElementType();
     return PointerType::getUnqual(EltTy);
   }
-  case IITDescriptor::VecOfPtrsToElt: {
-    Type *Ty = Tys[D.getArgumentNumber()];
-    VectorType *VTy = dyn_cast<VectorType>(Ty);
-    if (!VTy)
-      llvm_unreachable("Expected an argument of Vector Type");
-    Type *EltTy = VTy->getVectorElementType();
-    return VectorType::get(PointerType::getUnqual(EltTy),
-                           VTy->getNumElements());
-  }
+  case IITDescriptor::VecOfAnyPtrsToElt:
+    // Return the overloaded type (which determines the pointers' address
+    // space).
+    return Tys[D.getOverloadArgNumber()];
  }
  llvm_unreachable("unhandled");
}
@@ -1055,11 +1049,23 @@
       return (!ThisArgType || !ReferenceType ||
               ThisArgType->getElementType() != ReferenceType->getElementType());
     }
-    case IITDescriptor::VecOfPtrsToElt: {
-      if (D.getArgumentNumber() >= ArgTys.size())
+    case IITDescriptor::VecOfAnyPtrsToElt: {
+      unsigned RefArgNumber = D.getRefArgNumber();
+
+      // This may only be used when referring to a previous argument.
+      if (RefArgNumber >= ArgTys.size())
         return true;
+
+      // Record the overloaded type.
+      assert(D.getOverloadArgNumber() == ArgTys.size() &&
+             "Table consistency error");
+      ArgTys.push_back(Ty);
+
+      // Verify that the overloaded type "matches" the Ref type: Ty must be a
+      // vector with the same width as Ref, composed of pointers to Ref's
+      // element type.
       VectorType * ReferenceType =
-        dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+        dyn_cast<VectorType> (ArgTys[RefArgNumber]);
       VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
       if (!ThisArgVecTy || !ReferenceType ||
           (ReferenceType->getVectorNumElements() !=
Index: lib/IR/IRBuilder.cpp
===================================================================
--- lib/IR/IRBuilder.cpp
+++ lib/IR/IRBuilder.cpp
@@ -293,11 +293,13 @@
     Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
                                      NumElts));
 
+  Type *OverloadedTypes[] = {DataTy, PtrsTy};
   Value *Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
 
-  // We specify only one type when we create this intrinsic. Types of other
-  // arguments are derived from this type.
-  return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name);
+  // We specify the return type and the type of the vector of pointers when
+  // we create this intrinsic; the types of the other arguments are derived
+  // from them.
+  return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes,
+                               Name);
 }
 
 /// \brief Create a call to a Masked Scatter intrinsic.
@@ -323,11 +325,13 @@
   if (!Mask)
     Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
                                      NumElts));
+
+  Type *OverloadedTypes[] = {DataTy, PtrsTy};
   Value *Ops[] = {Data, Ptrs, getInt32(Align), Mask};
 
-  // We specify only one type when we create this intrinsic. Types of other
-  // arguments are derived from this type.
+  // We specify the data type and the type of the vector of pointers when we
+  // create this intrinsic; the types of the other arguments are derived from
+  // them.
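+  // For example (illustrative): scattering <4 x i32> data through
+  // <4 x i32 addrspace(1)*> pointers produces a call to
+  //   @llvm.masked.scatter.v4i32.v4p1i32
+  // where the old mangling produced only @llvm.masked.scatter.v4i32.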
- return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy }); + return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes); } template Index: test/Analysis/CostModel/X86/masked-intrinsic-cost.ll =================================================================== --- test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -78,10 +78,10 @@ ; SKX-LABEL: test_gather_2f64 ; SKX: Found an estimated cost of 7 {{.*}}.gather -%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ret <2 x double> %res } -declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { @@ -94,7 +94,7 @@ ; SKX-LABEL: test_gather_4i32 ; SKX: Found an estimated cost of 6 {{.*}}.gather -%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ret <4 x i32> %res } @@ -109,10 +109,10 @@ ; SKX-LABEL: test_gather_4i32_const_mask ; SKX: Found an estimated cost of 6 {{.*}}.gather -%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) ret <4 x i32> %res } -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0) define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) { @@ -128,7 +128,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -146,7 +146,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ret <16 x float>%res } @@ -164,7 +164,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ret <16 x float>%res } @@ -185,7 +185,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) 
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -204,7 +204,7 @@ %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind %imask = bitcast i16 %mask to <16 x i1> - call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) ret void } @@ -218,11 +218,11 @@ ; SKX-LABEL: test_scatter_8i32 ; SKX: Found an estimated cost of 10 {{.*}}.scatter - call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask) define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; AVX2-LABEL: test_scatter_4i32 @@ -234,7 +234,7 @@ ; SKX-LABEL: test_scatter_4i32 ; SKX: Found an estimated cost of 6 {{.*}}.scatter - call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) ret void } @@ -252,7 +252,7 @@ %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind - %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ret <4 x float>%res } @@ -270,14 +270,14 @@ %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind - %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float>%res } -declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> ) -declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask) -declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) -declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> ) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask) +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>) declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) Index: test/Analysis/CostModel/X86/vector_gep.ll =================================================================== --- test/Analysis/CostModel/X86/vector_gep.ll +++ test/Analysis/CostModel/X86/vector_gep.ll @@ -3,7 +3,7 @@ %struct.S = type { [1000 x i32] } -declare <4 x i32> 
@llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){ %temp = insertelement <4 x i64> undef, i64 %base, i32 0 @@ -12,6 +12,6 @@ %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer ;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32] %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector - %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> , <4 x i32> undef) + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %res } Index: test/Assembler/auto_upgrade_intrinsics.ll =================================================================== --- test/Assembler/auto_upgrade_intrinsics.ll +++ test/Assembler/auto_upgrade_intrinsics.ll @@ -85,6 +85,23 @@ ret void } +declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) + +define <2 x double> @tests.masked.gather(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %passthru) { +; CHECK-LABEL: @tests.masked.gather( +; CHECK: @llvm.masked.gather.v2f64.v2p0f64 + %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru) + ret <2 x double> %res +} + +declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask) + +define void @tests.masked.scatter(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %val) { +; CHECK-LABEL: @tests.masked.scatter( +; CHECK: @llvm.masked.scatter.v2f64.v2p0f64 + call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptr, i32 3, <2 x i1> %mask) + ret void +} declare {}* @llvm.invariant.start(i64, i8* nocapture) nounwind readonly declare void @llvm.invariant.end({}*, i64, i8* nocapture) nounwind Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -54,13 +54,13 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } -declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) -declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>) -declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> ) +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> ) ; SCALAR-LABEL: test2 @@ -111,7 +111,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind %imask = bitcast i16 %mask to <16 x i1> - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, 
i32 4, <16 x i1> %imask, <16 x float>undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef) ret <16 x float> %res } @@ -152,7 +152,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind %imask = bitcast i16 %mask to <16 x i1> - %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) + %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) ret <16 x i32> %res } @@ -205,8 +205,8 @@ %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind %imask = bitcast i16 %mask to <16 x i1> - %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) - %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) + %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) + %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) %res = add <16 x i32> %gt1, %gt2 ret <16 x i32> %res } @@ -270,13 +270,13 @@ %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind %imask = bitcast i16 %mask to <16 x i1> - call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) - call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) ret void } -declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> ) -declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> ) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> ) +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> ) ; SCALAR-LABEL: test6 @@ -326,9 +326,9 @@ ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0 ; SKX_32-NEXT: retl - %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> , <8 x i32> undef) + %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> , <8 x i32> undef) - call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> ) + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> ) ret <8 x i32>%a } @@ -384,8 +384,8 @@ %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind %imask = bitcast i8 %mask to <8 x i1> - %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef) - %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1) + %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef) + %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1) %res = add <8 x i32> %gt1, %gt2 ret <8 x i32> %res } @@ -444,8 +444,8 @@ ; SKX_32-NEXT: retl %imask = bitcast i16 %mask to <16 x i1> - %gt1 
= call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) - %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) + %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) + %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) %res = add <16 x i32> %gt1, %gt2 ret <16 x i32> %res } @@ -522,7 +522,7 @@ %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> , <8 x i32>, <8 x i32> %ind5, <8 x i64> - %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> , <8 x i32> undef) + %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> , <8 x i32> undef) ret <8 x i32> %res } @@ -591,7 +591,7 @@ %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13 - %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> , <8 x i32> undef) + %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> , <8 x i32> undef) ret <8 x i32> %res } @@ -632,7 +632,7 @@ %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -671,7 +671,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -710,7 +710,7 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -772,13 +772,13 @@ %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } -declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) -declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>) -declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <4 x double> 
@llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>) ; Gather smaller than existing instruction define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { @@ -831,7 +831,7 @@ %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind - %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) ret <4 x float>%res } @@ -890,7 +890,7 @@ %sext_ind = sext <4 x i32> %ind to <4 x i64> %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind - %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) + %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) ret <4 x double>%res } @@ -942,15 +942,15 @@ %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind - %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) ret <2 x double>%res } -declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> ) -declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> ) -declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> ) -declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) -declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> ) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> ) +declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> ) +declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> ) +declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) +declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> ) define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; @@ -995,7 +995,7 @@ ; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) ret void } @@ -1049,7 +1049,7 @@ ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %gep = getelementptr double, double* %ptr, <4 x i64> %ind - call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) + call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) ret void } @@ -1103,7 +1103,7 @@ ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) + call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) ret void } @@ -1157,12 +1157,12 @@ ; SKX_32-NEXT: 
vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void } ; The result type requires widening -declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>) define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { ; @@ -1222,12 +1222,12 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind - %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) + %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) ret <2 x float>%res } -declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) -declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; @@ -1276,7 +1276,7 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind - %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) + %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res } @@ -1320,7 +1320,7 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind - %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> , <2 x i32> undef) + %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> , <2 x i32> undef) ret <2 x i32>%res } @@ -1371,7 +1371,7 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind - %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) + %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) ret <2 x i64>%res } @@ -1418,7 +1418,7 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind - %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> , <2 x i64> %src0) + %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> , <2 x i64> %src0) ret <2 x i64>%res } @@ -1466,7 +1466,7 @@ ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind - %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> , <2 x float> undef) + %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> , <2 x float> undef) ret <2 x float>%res } @@ -1515,7 +1515,7 @@ ; SKX_32-NEXT: vpscatterqd %xmm0, 
(,%ymm1) {%k1} ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) ret void } @@ -1568,23 +1568,23 @@ %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } ; Check non-power-of-2 case. It should be scalarized. -declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) +declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { ; ALL-LABEL: test30: ; ALL-NOT: gather %sext_ind = sext <3 x i32> %ind to <3 x i64> %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind - %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) + %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) ret <3 x i32>%res } -declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) +declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) ; KNL-LABEL: test31 ; KNL: vpgatherqq @@ -1626,7 +1626,7 @@ ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX_32-NEXT: retl - %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> , <16 x float*> undef) + %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> , <16 x float*> undef) ret <16 x float*>%res } @@ -1672,7 +1672,7 @@ ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; SKX_32-NEXT: retl - %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) + %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) ret <16 x i32> %res } define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { @@ -1749,10 +1749,10 @@ ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: retl - %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) + %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res } -declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) +declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { ; KNL_64-LABEL: test_gather_16f32: ; KNL_64: # BB#0: @@ -1795,7 +1795,7 @@ ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} ; SKX_32-NEXT: vmovaps %zmm2, %zmm0 ; SKX_32-NEXT: retl - %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> 
%ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) ret <16 x float> %res } define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { @@ -1872,10 +1872,10 @@ ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: retl - %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) + %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res } -declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) +declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { ; KNL_64-LABEL: test_scatter_16i32: ; KNL_64: # BB#0: @@ -1918,7 +1918,7 @@ ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask) ret void } define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { @@ -1993,10 +1993,10 @@ ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask) define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { ; KNL_64-LABEL: test_scatter_16f32: ; KNL_64: # BB#0: @@ -2039,10 +2039,10 @@ ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask) define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { ; KNL_64-LABEL: test_scatter_16f64: ; KNL_64: # BB#0: @@ -2115,10 +2115,10 @@ ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl - call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask) +declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask) define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) { ; KNL_64-LABEL: test_pr28312: @@ -2193,11 +2193,11 @@ ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: retl - %g1 = call <4 x i64> 
@llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) - %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) - %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) + %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) + %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) + %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) %a = add <4 x i64> %g1, %g2 %b = add <4 x i64> %a, %g3 ret <4 x i64> %b } -declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) Index: test/Transforms/FunctionAttrs/readattrs.ll =================================================================== --- test/Transforms/FunctionAttrs/readattrs.ll +++ test/Transforms/FunctionAttrs/readattrs.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -functionattrs -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs)' -S | FileCheck %s @x = global i32 0 @@ -68,22 +69,22 @@ } ; CHECK: declare void @llvm.masked.scatter -declare void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>) ; CHECK-NOT: readnone ; CHECK-NOT: readonly ; CHECK: define void @test9 define void @test9(<4 x i32*> %ptrs, <4 x i32>%val) { - call void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1>) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1>) ret void } ; CHECK: declare <4 x i32> @llvm.masked.gather -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) ; CHECK: readonly ; CHECK: define <4 x i32> @test10 define <4 x i32> @test10(<4 x i32*> %ptrs) { - %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1>, <4 x i32>undef) + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1>, <4 x i32>undef) ret <4 x i32> %res } Index: test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll =================================================================== --- test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll +++ test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s -declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) -declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) ; This test ensures that masked scatter and gather operations, which take vectors of pointers, ; do not have pointer aliasing ignored when being processed. 
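; For example (an illustrative sketch, not taken from this test): given the
; same vector of pointers %p,
;   call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %v, <2 x i32*> %p, i32 1, <2 x i1> <i1 true, i1 true>)
;   %r = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %p, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
; the gather must be treated as reading memory the scatter may have written,
; so GVN cannot simply reuse a value loaded before the scatter.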
@@ -20,18 +20,18 @@ %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0 %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1 ; Read from in1 and in2 - %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> , <2 x i32> undef) #1 - %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> , <2 x i32> undef) #1 + %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> , <2 x i32> undef) #1 + %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> , <2 x i32> undef) #1 ; Store in1 to the allocas - call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> ); + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> ); ; Read in1 from the allocas ; This gather should alias the scatter we just saw - %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1 + %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1 ; Store in2 to the allocas - call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> ); + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> ); ; Read in2 from the allocas ; This gather should alias the scatter we just saw, and not be eliminated - %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1 + %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1 ; Store in2 to out for good measure %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0 %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1 Index: test/Transforms/InstCombine/masked_intrinsics.ll =================================================================== --- test/Transforms/InstCombine/masked_intrinsics.ll +++ test/Transforms/InstCombine/masked_intrinsics.ll @@ -2,8 +2,8 @@ declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0) declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask) -declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru) -declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru) +declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask) define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru) { %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru) @@ -49,7 +49,7 @@ } define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru) { - %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru) + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru) ret <2 x double> %res ; CHECK-LABEL: @gather_zeromask( @@ -57,7 +57,7 @@ } define void @scatter_zeromask(<2 x double*> %ptrs, <2 x double> %val) { - call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> 
%ptrs, i32 6, <2 x i1> zeroinitializer)
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 6, <2 x i1> zeroinitializer)
   ret void
 
 ; CHECK-LABEL: @scatter_zeromask(
Index: test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -36,7 +36,7 @@
 ; CHECK-NEXT:  [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:  [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32>
 ; CHECK-NEXT:  [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
-; CHECK-NEXT:  call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> )
+; CHECK-NEXT:  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> )
 ; CHECK-NEXT:  [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:  [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]],
 ; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
Index: test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -17,9 +17,9 @@
 ;}
 
 ;AVX512-LABEL: @foo1
-;AVX512: llvm.masked.load.v16i32
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.load.v16i32.p0v16i32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.store.v16f32.p0v16f32
 ;AVX512: ret void
 
 ; Function Attrs: nounwind uwtable
@@ -96,8 +96,8 @@
 
 ;AVX512-LABEL: @foo2
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@@ -171,10 +171,10 @@
 
 ;AVX512-LABEL: @foo3
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
 ;AVX512: fadd <16 x float>
 ;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 
 %struct.Out = type { float, float }
@@ -233,4 +233,194 @@
 for.end:                                          ; preds = %for.cond
   ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2, but the scatter/gather pointer argument is a vector of
+; pointers in address space 1.
+
+;AVX512-LABEL: @foo2_addrspace
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*,
align 8 + %i = alloca i32, align 4 + store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 + store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 + %6 = load float, float addrspace(1)* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4 + store float %add, float addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Same as foo2_addrspace but here only the input has the non-default address space. 
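+; (Mixed address spaces change only the pointer-vector suffix: the gather
+; below is mangled @llvm.masked.gather.v16f32.v16p1f32, while the scatter
+; keeps @llvm.masked.scatter.v16f32.v16p0f32, as the CHECK lines verify.)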
+ +;AVX512-LABEL: @foo2_addrspace2 +;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1 +;AVX512: llvm.masked.gather.v16f32.v16p1f32 +;AVX512: llvm.masked.scatter.v16f32.v16p0f32 +;AVX512: ret void +define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { +entry: + %in.addr = alloca %struct.In addrspace(1)*, align 8 + %out.addr = alloca float addrspace(0)*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 + store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 + %6 = load float, float addrspace(1)* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4 + store float %add, float addrspace(0)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Same as foo2_addrspace but here only the output has the non-default address space. 
+
+;AVX512-LABEL: @foo2_addrspace3
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(0)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(0)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
Index: test/Transforms/LoopVectorize/X86/scatter_crash.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -23,11 +23,11 @@
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[TMP13]], i32 16, <16 x i1> )
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> [[TMP13]], i32 16, <16 x i1> )
 ; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]],
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[TMP16]], i32 8, <16 x i1> )
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> [[TMP16]], i32 8, <16 x i1> )
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]],
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]],
Index: test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
===================================================================
--- test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
+++ test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
@@ -1,8 +1,8 @@
 ; XFAIL: *
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 
-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
 
 ; This test ensures that masked scatter and gather operations, which take vectors of pointers,
 ; do not have pointer aliasing ignored when being processed.
@@ -21,18 +21,18 @@
   %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0
   %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
 ; Read from in1 and in2
-  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> , <2 x i32> undef) #1
-  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> , <2 x i32> undef) #1
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> , <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> , <2 x i32> undef) #1
 ; Store in1 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> );
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> );
 ; Read in1 from the allocas
 ; This gather should alias the scatter we just saw
-  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1
 ; Store in2 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> );
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> );
 ; Read in2 from the allocas
 ; This gather should alias the scatter we just saw, and not be eliminated
-  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> , <2 x i32> undef) #1
 ; Store in2 to out for good measure
   %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
   %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
Index: test/Verifier/scatter_gather.ll
===================================================================
--- test/Verifier/scatter_gather.ll
+++ test/Verifier/scatter_gather.ll
@@ -0,0 +1,122 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
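+; Each function below breaks one structural requirement of the masked
+; gather/scatter intrinsics, so the verifier must reject it. For contrast,
+; a well-formed pair of declarations (assuming the default address space)
+; would be:
+;   declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
+;   declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+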
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x float> @gather2(<16 x float*> %ptrs, <16 x i1>* %mask, <16 x float> %passthru) {
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1>* %mask, <16 x float> %passthru)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>*, <16 x float>)
+
+; Mask length != return length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather3(<8 x float*> %ptrs, <16 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <16 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <16 x i1>, <8 x float>)
+
+; Return type is not a vector
+; CHECK: Intrinsic has incorrect return type!
+define <8 x float>* @gather4(<8 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float>* %res
+}
+declare <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Ptrs is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather5(<8 x float*>* %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>* %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>*, i32, <8 x i1>, <8 x float>)
+
+; Ptrs is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather6(<8 x float> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float>, i32, <8 x i1>, <8 x float>)
+
+; Value element type != vector of pointers element type
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather7(<8 x double*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x float>)
+
+; Value length != vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather8(<16 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Passthru type doesn't match return type
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x i32> @gather9(<16 x i32*> %ptrs, <16 x i1> %mask, <8 x i32> %passthru) {
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <8 x i32> %passthru)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <8 x i32>)
+
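+; The scatter cases below mirror the gather cases above, with the stored
+; value vector playing the role of the gather's return value.
+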
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter2(<16 x float> %value, <16 x float*> %ptrs, <16 x i1>* %mask) {
+  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %value, <16 x float*> %ptrs, i32 4, <16 x i1>* %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>*)
+
+; Mask length != value length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter3(<8 x float> %value, <8 x float*> %ptrs, <16 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %value, <8 x float*> %ptrs, i32 4, <16 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <16 x i1>)
+
+; Value type is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter4(<8 x float>* %value, <8 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>* %value, <8 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>*, <8 x float*>, i32, <8 x i1>)
+
+; Ptrs is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter5(<8 x float> %value, <8 x float*>* %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float> %value, <8 x float*>* %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float>, <8 x float*>*, i32, <8 x i1>)
+
+; Ptrs is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter6(<8 x float> %value, <8 x float> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8f32(<8 x float> %value, <8 x float> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8f32(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+; Value element type != vector of pointers element type
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter7(<8 x float> %value, <8 x double*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float> %value, <8 x double*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float>, <8 x double*>, i32, <8 x i1>)
+
+; Value length != vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter8(<8 x float> %value, <16 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float> %value, <16 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float>, <16 x float*>, i32, <8 x i1>)
+
Index: utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- utils/TableGen/IntrinsicEmitter.cpp
+++ utils/TableGen/IntrinsicEmitter.cpp
@@ -214,7 +214,7 @@
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
   IIT_PTR_TO_ELT = 33,
-  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_VEC_OF_ANYPTRS_TO_ELT = 34,
   IIT_I128 = 35,
   IIT_V512 = 36,
   IIT_V1024 = 37
@@ -276,8 +276,16 @@
   }
   else if (R->isSubClassOf("LLVMPointerTo"))
     Sig.push_back(IIT_PTR_TO_ARG);
-  else if (R->isSubClassOf("LLVMVectorOfPointersToElt"))
-    Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT);
+  else if (R->isSubClassOf("LLVMVectorOfAnyPointersToElt")) {
+    Sig.push_back(IIT_VEC_OF_ANYPTRS_TO_ELT);
+    unsigned ArgNo = ArgCodes.size();
+    ArgCodes.push_back(3 /*vAny*/);
+    // Encode overloaded ArgNo
+    Sig.push_back(ArgNo);
+    // Encode LLVMMatchType ArgNo
+    Sig.push_back(Number);
+    return;
+  }
   else if (R->isSubClassOf("LLVMPointerToElt"))
     Sig.push_back(IIT_PTR_TO_ELT);
   else