diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -82,8 +83,11 @@
 
 /// \returns Whether the vector mask \p MaskVal has all lane bits set.
 static bool isAllTrueMask(Value *MaskVal) {
-  auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
-  return ConstVec && ConstVec->isAllOnesValue();
+  if (Value *SplattedVal = getSplatValue(MaskVal))
+    if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+      return ConstValue->isAllOnesValue();
+
+  return false;
 }
 
 /// \returns A non-excepting divisor constant for this type.
@@ -171,6 +175,10 @@
   Value *expandPredicationInReduction(IRBuilder<> &Builder,
                                       VPReductionIntrinsic &PI);
 
+  /// \brief Lower this VP memory operation to a non-VP intrinsic.
+  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                            VPIntrinsic &VPI);
+
   /// \brief Query TTI and expand the vector predication in \p P accordingly.
   Value *expandPredication(VPIntrinsic &PI);
 
@@ -389,6 +397,53 @@
   return Reduction;
 }
 
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                                      VPIntrinsic &VPI) {
+  assert(VPI.canIgnoreVectorLengthParam());
+
+  Value *MaskParam = VPI.getMaskParam();
+  Value *PtrParam = VPI.getMemoryPointerParam();
+  Value *DataParam = VPI.getMemoryDataParam();
+  bool IsUnmasked = isAllTrueMask(MaskParam);
+
+  MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+  Value *NewMemoryInst = nullptr;
+  switch (VPI.getIntrinsicID()) {
+  default:
+    llvm_unreachable("Not a VP memory intrinsic");
+  case Intrinsic::vp_store:
+    if (IsUnmasked) {
+      StoreInst *NewStore =
+          Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewStore->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewStore;
+    } else
+      NewMemoryInst = Builder.CreateMaskedStore(
+          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  case Intrinsic::vp_load:
+    if (IsUnmasked) {
+      LoadInst *NewLoad =
+          Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewLoad->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewLoad;
+    } else
+      NewMemoryInst = Builder.CreateMaskedLoad(
+          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  }
+
+  assert(NewMemoryInst);
+  replaceOperation(*NewMemoryInst, VPI);
+  return NewMemoryInst;
+}
+
 void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
   LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
 
@@ -465,6 +520,14 @@
   if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
     return expandPredicationInReduction(Builder, *VPRI);
 
+  switch (VPI.getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::vp_load:
+  case Intrinsic::vp_store:
+    return expandPredicationInMemoryIntrinsic(Builder, VPI);
+  }
+
   return &VPI;
 }
 
diff --git a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt --expandvp -S < %s | FileCheck %s
+; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s
+
+; Fixed vectors
+define <2 x i64> @vpload_v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_vlmax(<2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_v2i64_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask(<2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask_vlmax(<2 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
+  ret <2 x i64> %load
+}
+
+define void @vpstore_v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_vlmax(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpstore_v2i64_vlmax(
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask(<2 x i64> %val, <2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, <2 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    store <2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
+  ret void
+}
+
+; Scalable vectors
+define <vscale x 1 x i64> @vpload_nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_vscale(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(<vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(<vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    store <vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+  ret void
+}
+
+declare i32 @llvm.vscale.i32()
+
+declare <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>*, <2 x i1>, i32)
+declare void @llvm.vp.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, <2 x i1>, i32)
+
+declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
+declare void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
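
For reference, a minimal IR sketch (not part of the patch) of the lowering performed by the new expandPredicationInMemoryIntrinsic path, assuming the EVL already covers the whole register so canIgnoreVectorLengthParam() holds; the value names %p, %m and %v are illustrative only, and the behavior shown follows directly from the tests above:

  ; A vp.load with a real mask becomes an llvm.masked.load; with no align attribute
  ; on the VP call, the alignment operand defaults to 1 (AlignOpt.valueOrOne()).
  %v = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %p, <2 x i1> %m, i32 2)
  ; becomes:
  %v = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %p, i32 1, <2 x i1> %m, <2 x i64> undef)

  ; With an all-true mask (now also recognized for splat constants via getSplatValue),
  ; the masked form is skipped and a plain load at the type's ABI alignment is emitted.
  %v = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %p, <2 x i1> <i1 true, i1 true>, i32 2)
  ; becomes:
  %v = load <2 x i64>, <2 x i64>* %p, align 16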