Index: llvm/lib/CodeGen/ExpandVectorPredication.cpp
===================================================================
--- llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -83,7 +84,23 @@
 /// \returns Whether the vector mask \p MaskVal has all lane bits set.
 static bool isAllTrueMask(Value *MaskVal) {
   auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
-  return ConstVec && ConstVec->isAllOnesValue();
+  if (ConstVec && ConstVec->isAllOnesValue())
+    return true;
+
+  if (isa<ScalableVectorType>(MaskVal->getType())) {
+    Value *SplattedVal = nullptr;
+    if (match(MaskVal,
+              m_Shuffle(m_InsertElt(PatternMatch::m_Undef(),
+                                    PatternMatch::m_Value(SplattedVal),
+                                    PatternMatch::m_ZeroInt()),
+                        PatternMatch::m_Undef(), PatternMatch::m_ZeroMask()))) {
+      auto *ConstValue = dyn_cast<Constant>(SplattedVal);
+      if (ConstValue && ConstValue->isOneValue())
+        return true;
+    }
+  }
+
+  return false;
 }
 
 /// \returns A non-excepting divisor constant for this type.
@@ -161,6 +178,10 @@
   Value *expandPredicationInReduction(IRBuilder<> &Builder,
                                       VPReductionIntrinsic &PI);
 
+  /// \brief Lower this VP memory operation to a non-VP intrinsic.
+  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                            VPIntrinsic &VPI);
+
   /// \brief Query TTI and expand the vector predication in \p P accordingly.
   Value *expandPredication(VPIntrinsic &PI);
 
@@ -381,6 +402,53 @@
   return Reduction;
 }
 
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                                      VPIntrinsic &VPI) {
+  assert(VPI.canIgnoreVectorLengthParam());
+
+  Value *MaskParam = VPI.getMaskParam();
+  Value *PtrParam = VPI.getMemoryPointerParam();
+  Value *DataParam = VPI.getMemoryDataParam();
+  bool IsUnmasked = isAllTrueMask(MaskParam);
+
+  MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+  Value *NewMemoryInst = nullptr;
+  switch (VPI.getIntrinsicID()) {
+  default:
+    llvm_unreachable("Not a VP memory intrinsic");
+  case Intrinsic::vp_store:
+    if (IsUnmasked) {
+      StoreInst *NewStore =
+          Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewStore->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewStore;
+    } else
+      NewMemoryInst = Builder.CreateMaskedStore(
+          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  case Intrinsic::vp_load:
+    if (IsUnmasked) {
+      LoadInst *NewLoad =
+          Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewLoad->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewLoad;
+    } else
+      NewMemoryInst = Builder.CreateMaskedLoad(
+          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  }
+
+  assert(NewMemoryInst);
+  replaceOperation(*NewMemoryInst, VPI);
+  return NewMemoryInst;
+}
+
 void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
   LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
 
@@ -457,6 +525,14 @@
   if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
     return expandPredicationInReduction(Builder, *VPRI);
 
+  switch (VPI.getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::vp_load:
+  case Intrinsic::vp_store:
+    return expandPredicationInMemoryIntrinsic(Builder, VPI);
+  }
+
   return &VPI;
 }
Index: llvm/test/CodeGen/Generic/expand-vp-load-store.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Generic/expand-vp-load-store.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt --expandvp -S < %s | FileCheck %s
+; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s
+
+; Fixed vectors
+define <2 x i64> @vpload_v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_vlmax(<2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_v2i64_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask(<2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask_vlmax(<2 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_constexpr_allones_mask_vlmax(<2 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_v2i64_constexpr_allones_mask_vlmax(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <2 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <2 x i1> [[ONE]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[ALLONES]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %one = insertelement <2 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <2 x i1> %one, <2 x i1> poison, <2 x i32> zeroinitializer
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %allones, i32 2)
+  ret <2 x i64> %load
+}
+
+define void @vpstore_v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_vlmax(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpstore_v2i64_vlmax(
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask(<2 x i64> %val, <2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, <2 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    store <2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
+  ret void
+}
+
+define void @vpstore_v2i64_constexpr_allones_mask_vlmax(<2 x i64> %val, <2 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_v2i64_constexpr_allones_mask_vlmax(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <2 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <2 x i1> [[ONE]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[ALLONES]])
+; CHECK-NEXT:    ret void
+;
+  %one = insertelement <2 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <2 x i1> %one, <2 x i1> poison, <2 x i32> zeroinitializer
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %allones, i32 2)
+  ret void
+}
+
+; Scalable vectors
+define <vscale x 1 x i64> @vpload_nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_vscale(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(<vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <vscale x 1 x i1> [[ONE]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[ALLONES]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %one = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %one, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %allones, i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(<vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <vscale x 1 x i1> [[ONE]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %one = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %one, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %allones, i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <vscale x 1 x i1> [[ONE]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[ALLONES]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  %one = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %one, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %allones, i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[ONE:%.*]] = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+; CHECK-NEXT:    [[ALLONES:%.*]] = shufflevector <vscale x 1 x i1> [[ONE]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    store <vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %one = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %one, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %allones, i32 %vlmax)
+  ret void
+}
+
+declare i32 @llvm.vscale.i32()
+
+declare <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>*, <2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
+declare void @llvm.vp.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, <2 x i1>, i32)
+declare void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>*, <vscale x 1 x i1>, i32)