Index: llvm/lib/CodeGen/ExpandVectorPredication.cpp
===================================================================
--- llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -70,9 +70,17 @@
 
 #undef VPINTERNAL_VPLEGAL_CASES
 
+// VP strided load/store -> VP gather/scatter transformation
+static cl::opt<bool> StridedOnlyTransformation(
+    "expandvp-strided-only", cl::init(false), cl::Hidden,
+    cl::desc("If true, ignore TargetTransformInfo and always expand "
+             "experimental.vp.strided.load/store intrinsics to "
+             "vp.gather/scatter ones (used in testing)."));
+
 // Whether any override options are set.
 static bool anyExpandVPOverridesSet() {
-  return !EVLTransformOverride.empty() || !MaskTransformOverride.empty();
+  return !EVLTransformOverride.empty() || !MaskTransformOverride.empty() ||
+         StridedOnlyTransformation;
 }
 
 #define DEBUG_TYPE "expandvp"
@@ -167,6 +175,11 @@
   Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                             VPIntrinsic &VPI);
 
+  /// \brief Lower this VP strided load/store either to a VP gather/scatter,
+  /// if legal for the target, or to an unpredicated masked gather/scatter.
+  Value *expandPredicationInStridedLoadStore(IRBuilder<> &Builder,
+                                             VPIntrinsic &PI);
+
   /// \brief Query TTI and expand the vector predication in \p P accordingly.
   Value *expandPredication(VPIntrinsic &PI);
 
@@ -527,6 +540,9 @@
   case Intrinsic::vp_gather:
   case Intrinsic::vp_scatter:
     return expandPredicationInMemoryIntrinsic(Builder, VPI);
+  case Intrinsic::experimental_vp_strided_load:
+  case Intrinsic::experimental_vp_strided_store:
+    return expandPredicationInStridedLoadStore(Builder, VPI);
   }
 
   return &VPI;
@@ -574,11 +590,91 @@
 
   // Overrides set - we are in testing, the following does not need to be
   // efficient.
+
+  // If StridedOnlyTransformation is true, EVLTransformOverride and
+  // MaskTransformOverride are ignored.
+  if (StridedOnlyTransformation) {
+    VPStrat.EVLParamStrategy = VPLegalization::Legal;
+    switch (VPI.getIntrinsicID()) {
+    default:
+      VPStrat.OpStrategy = VPLegalization::Legal;
+      break;
+    case Intrinsic::experimental_vp_strided_load:
+    case Intrinsic::experimental_vp_strided_store:
+      VPStrat.OpStrategy = VPLegalization::Convert;
+      break;
+    }
+    return VPStrat;
+  }
+
   VPStrat.EVLParamStrategy = parseOverrideOption(EVLTransformOverride);
   VPStrat.OpStrategy = parseOverrideOption(MaskTransformOverride);
   return VPStrat;
 }
 
+Value *
+CachingVPExpander::expandPredicationInStridedLoadStore(IRBuilder<> &Builder,
+                                                       VPIntrinsic &PI) {
+  auto CreateGatherScatterPointer = [&](VectorType *VT,
+                                        Value *Stride) -> Value * {
+    Value *MemPtr = PI.getMemoryPointerParam();
+    ElementCount EC = VT->getElementCount();
+    // Bitcast MemPtr to an i8 pointer so the stride is applied in bytes.
+    Value *Cast = Builder.CreateBitCast(MemPtr, Builder.getInt8PtrTy());
+    // Create a vector of pointers %addrs in the form:
+    // %addrs = <%addr, %addr + %stride, %addr + 2 * %stride, ...>
+    Value *Splat = Builder.CreateVectorSplat(EC, Stride);
+    Value *StepVector =
+        Builder.CreateStepVector(VectorType::get(Builder.getInt64Ty(), EC));
+    Value *Indexes =
+        Builder.CreateBinOp(Instruction::BinaryOps::Mul, StepVector, Splat);
+    Value *GEP = Builder.CreateGEP(Builder.getInt8Ty(), Cast, Indexes);
+    // Bitcast the generated vector of i8 pointers back to pointers to the
+    // original element type.
+    VectorType *DstType =
+        VectorType::get(VT->getElementType()->getPointerTo(), EC);
+    return Builder.CreateBitCast(GEP, DstType);
+  };
+
+  unsigned IntrNo = Intrinsic::not_intrinsic;
+  SmallVector<Type *, 2> Types;
+  SmallVector<Value *, 4> Args;
+  switch (PI.getIntrinsicID()) {
+  default:
+    llvm_unreachable("Not a vp.strided.* intrinsic");
+  case Intrinsic::experimental_vp_strided_load: {
+    IntrNo = Intrinsic::vp_gather;
+    VectorType *VT = cast<VectorType>(PI.getType());
+    Value *VecOfPtrs = CreateGatherScatterPointer(VT, PI.getOperand(1));
+    Types = {VT, VecOfPtrs->getType()};
+    Args = {VecOfPtrs, PI.getMaskParam(), PI.getVectorLengthParam()};
+    break;
+  }
+  case Intrinsic::experimental_vp_strided_store: {
+    IntrNo = Intrinsic::vp_scatter;
+    VectorType *VT = cast<VectorType>(PI.getMemoryDataParam()->getType());
+    Value *VecOfPtrs = CreateGatherScatterPointer(VT, PI.getOperand(2));
+    Types = {VT, VecOfPtrs->getType()};
+    Args = {PI.getMemoryDataParam(), VecOfPtrs, PI.getMaskParam(),
+            PI.getVectorLengthParam()};
+    break;
+  }
+  }
+
+  auto *Intr = Builder.CreateIntrinsic(IntrNo, Types, Args);
+  replaceOperation(*Intr, PI);
+
+  VPIntrinsic *VPIntr = cast<VPIntrinsic>(Intr);
+  auto VPStrat = getVPLegalizationStrategy(*VPIntr);
+  sanitizeStrategy(*Intr, VPStrat);
+  if (!VPStrat.shouldDoNothing()) {
+    Builder.SetInsertPoint(VPIntr);
+    return expandPredicationInMemoryIntrinsic(Builder, *VPIntr);
+  }
+
+  return Intr;
+}
+
 /// \brief Expand llvm.vp.* intrinsics as requested by \p TTI.
 bool CachingVPExpander::expandVectorPredication() {
   SmallVector<TransformJob, 16> Worklist;
@@ -615,7 +711,7 @@
   }
   Job.Strategy.EVLParamStrategy = VPLegalization::Legal;
 
-  // Replace with a non-predicated operation.
+  // Replace with a non-predicated operation or another legal VP operation.
   switch (Job.Strategy.OpStrategy) {
   case VPLegalization::Legal:
     break;
Index: llvm/test/CodeGen/Generic/expand-vp-strided-accesses.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Generic/expand-vp-strided-accesses.ll
@@ -0,0 +1,47 @@
+; RUN: opt --expandvp --expandvp-strided-only -S < %s | FileCheck %s --check-prefixes=CHECK,VP_TO_VP
+; RUN: opt --expandvp -S < %s | FileCheck %s --check-prefixes=CHECK,VP_TO_NOT_VP
+
+define <vscale x 2 x float> @strided_load(float* %ptr, i64 %stride, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+  %v = call <vscale x 2 x float> @llvm.experimental.vp.strided.load.nxv2f32(float* %ptr, i64 %stride, <vscale x 2 x i1> %mask, i32 %evl)
+  ret <vscale x 2 x float> %v
+}
+
+; CHECK: [[LANE:%.+]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 %evl)
+; CHECK-NEXT: [[MASK:%.+]] = and <vscale x 2 x i1> [[LANE]], %mask
+; CHECK-NEXT: [[VSCALE:%.+]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCAL_SIZE:%.+]] = mul nuw i32 [[VSCALE]], 2
+; CHECK-NEXT: [[BITCAST:%.+]] = bitcast float* %ptr to i8*
+; CHECK-NEXT: [[NINS:%.+]] = insertelement <vscale x 2 x i64> poison, i64 %stride, i32 0
+; CHECK-NEXT: [[NSPLAT:%.+]] = shufflevector <vscale x 2 x i64> [[NINS]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[STEPVECTOR:%.+]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[INDICES:%.+]] = mul <vscale x 2 x i64> [[STEPVECTOR]], [[NSPLAT]]
+; CHECK-NEXT: [[GEP:%.+]] = getelementptr i8, i8* [[BITCAST]], <vscale x 2 x i64> [[INDICES]]
+; CHECK-NEXT: [[PTRS:%.+]] = bitcast <vscale x 2 x i8*> [[GEP]] to <vscale x 2 x float*>
+
+; VP_TO_NOT_VP: call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTRS]], i32 1, <vscale x 2 x i1> [[MASK]], <vscale x 2 x float> undef)
+
+; VP_TO_VP: call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTRS]], <vscale x 2 x i1> [[MASK]], i32 [[SCAL_SIZE]])
+
+define void @strided_store(<vscale x 2 x float> %v, float *%ptr, i64 %stride, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+  call void @llvm.experimental.vp.strided.store.nxv2f32(<vscale x 2 x float> %v, float* %ptr, i64 %stride, <vscale x 2 x i1> %mask, i32 %evl)
+  ret void
+}
+
+; CHECK: [[LANE:%.+]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 %evl)
+; CHECK-NEXT: [[MASK:%.+]] = and <vscale x 2 x i1> [[LANE]], %mask
+; CHECK-NEXT: [[VSCALE:%.+]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCAL_SIZE:%.+]] = mul nuw i32 [[VSCALE]], 2
+; CHECK-NEXT: [[BITCAST:%.+]] = bitcast float* %ptr to i8*
+; CHECK-NEXT: [[NINS:%.+]] = insertelement <vscale x 2 x i64> poison, i64 %stride, i32 0
+; CHECK-NEXT: [[NSPLAT:%.+]] = shufflevector <vscale x 2 x i64> [[NINS]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[STEPVECTOR:%.+]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[INDICES:%.+]] = mul <vscale x 2 x i64> [[STEPVECTOR]], [[NSPLAT]]
+; CHECK-NEXT: [[GEP:%.+]] = getelementptr i8, i8* [[BITCAST]], <vscale x 2 x i64> [[INDICES]]
+; CHECK-NEXT: [[PTRS:%.+]] = bitcast <vscale x 2 x i8*> [[GEP]] to <vscale x 2 x float*>
+
+; VP_TO_NOT_VP: call void @llvm.masked.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> %v, <vscale x 2 x float*> [[PTRS]], i32 1, <vscale x 2 x i1> [[MASK]])
+
+; VP_TO_VP: call void @llvm.vp.scatter.nxv2f32.nxv2p0f32(<vscale x 2 x float> %v, <vscale x 2 x float*> [[PTRS]], <vscale x 2 x i1> [[MASK]], i32 [[SCAL_SIZE]])
+
+declare <vscale x 2 x float> @llvm.experimental.vp.strided.load.nxv2f32(float*, i64, <vscale x 2 x i1>, i32)
+declare void @llvm.experimental.vp.strided.store.nxv2f32(<vscale x 2 x float>, float*, i64, <vscale x 2 x i1>, i32)
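
Below is a minimal sketch of the intended rewrite on a fixed-width vector; the value names and the v4f32 intrinsic manglings are illustrative assumptions, not taken from the patch or its test. A strided load such as

  ; illustrative input (hypothetical v4f32 mangling)
  %v = call <4 x float> @llvm.experimental.vp.strided.load.v4f32(float* %ptr, i64 %stride, <4 x i1> %mask, i32 %evl)

is conceptually rewritten by expandPredicationInStridedLoadStore into a byte-strided address vector feeding a vp.gather (in practice the EVL may additionally be folded into the mask first, as the scalable-vector CHECK lines above show):

  ; illustrative output (names are assumptions)
  %base    = bitcast float* %ptr to i8*
  %ins     = insertelement <4 x i64> poison, i64 %stride, i32 0
  %splat   = shufflevector <4 x i64> %ins, <4 x i64> poison, <4 x i32> zeroinitializer
  %step    = call <4 x i64> @llvm.experimental.stepvector.v4i64()
  %offsets = mul <4 x i64> %step, %splat
  %gep     = getelementptr i8, i8* %base, <4 x i64> %offsets
  %addrs   = bitcast <4 x i8*> %gep to <4 x float*>
  %v       = call <4 x float> @llvm.vp.gather.v4f32.v4p0f32(<4 x float*> %addrs, <4 x i1> %mask, i32 %evl)

If the target does not handle vp.gather either, the patch re-runs the memory-intrinsic expansion on the new call, producing the llvm.masked.gather checked by the VP_TO_NOT_VP prefix.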