Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -16158,6 +16158,68 @@ %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef +.. _int_get_active_lane_mask: + +'``llvm.get.active.lane.mask.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %VIV[0], i32 %BTC) + declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %VIV[0], i64 %BTC) + declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %VIV[0], i64 %BTC) + + +Overview: +""""""""" + +Create a mask representing active and inactive vector lanes. + + +Arguments: +"""""""""" + +Both operands have the same scalar integer type. The first operand is the first +element of the Vector Induction Variable (VIV), denoted by ``%VIV[0]``. The second +operand is the scalar loop Back-edge Taken Count (BTC). The result is a vector +with the same number of elements as the VIV, but with the i1 element value +type. + +The arguments are scalar types to accommodate scalable vector types, for which +it is unknown what the type of the step vector needs to be that enumerates +its lanes without overflow. + + +Semantics: +"""""""""" + +The '``llvm.get.active.lane.mask.*``' intrinsics are semantically equivalent to +``icmp ule %VIV, (splat %BTC)``, where VIV is the Vector Induction Variable, and +BTC the back-edge taken count splat into a vector, except when the vector +induction variable overflows, in which case they return false in the lanes +where the VIV overflows. Thus, these intrinsics perform an element-wise less +than or equal comparison of VIV with BTC, producing a mask of true/false values +representing active/inactive vector lanes. This mask can e.g. be used in the +masked load/store instructions. These intrinsics provide a hint to the +backend. 
I.e., for a vector loop, the back-edge taken count of the original +scalar loop is explicit as the second argument. + + +Examples: +""""""""" + +.. code-block:: llvm + + %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> + %elem0 = extractelement <4 x i64> %induction, i64 0 + %get.active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429) + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %get.active.lane.mask, <4 x i32> undef) + + .. _int_mload_mstore: Masked Vector Load and Store Intrinsics Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -532,6 +532,11 @@ DominatorTree *DT, const LoopAccessInfo *LAI) const; + /// Query the target whether lowering of the llvm.get.active.lane.mask + /// intrinsic is supported and if emitting it is desired for this loop. 
+ bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) const; + /// @} /// \name Scalar Target Information @@ -1265,6 +1270,8 @@ preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; + virtual bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1556,6 +1563,10 @@ const LoopAccessInfo *LAI) override { return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } + bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) override { + return Impl.emitGetActiveLaneMask(L, LI, SE, TailFolded); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -140,6 +140,11 @@ return false; } + bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) const { + return false; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, TTI::UnrollingPreferences &) {} Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -486,6 +486,11 @@ return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } + bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) { + return BaseT::emitGetActiveLaneMask(L, LI, SE, TailFolded); + } + int getInstructionLatency(const Instruction *I) { if 
(isa<LoadInst>(I)) return getST()->getSchedModel().DefaultLoadLatency; Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1235,6 +1235,10 @@ } +def int_get_active_lane_mask: + Intrinsic<[llvm_anyvector_ty], + [llvm_anyint_ty, LLVMMatchType<1>], + [IntrNoMem, IntrNoSync, IntrWillReturn]>; //===-------------------------- Masked Intrinsics -------------------------===// // Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -307,6 +307,11 @@ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); } +bool TargetTransformInfo::emitGetActiveLaneMask(Loop *L, LoopInfo *LI, + ScalarEvolution &SE, bool TailFolded) const { + return TTIImpl->emitGetActiveLaneMask(L, LI, SE, TailFolded); +} + void TargetTransformInfo::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { return TTIImpl->getUnrollingPreferences(L, SE, UP); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -250,6 +250,9 @@ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + bool TailFolded) const; + bool shouldBuildLookupTablesForConstant(Constant *C) const { // In the ROPI and RWPI relocation models we can't have pointers to global // variables or functions in constant data, so don't convert switches to Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ 
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1385,7 +1385,16 @@ return canTailPredicateLoop(L, LI, SE, DL, LAI); } - +bool ARMTTIImpl::emitGetActiveLaneMask(Loop *L, LoopInfo *LI, + ScalarEvolution &SE, bool TailFolded) const { + // If this loop is tail-folded, we always want to emit the + // llvm.get.active.lane.mask intrinsic, so that this can be picked up in the + // MVETailPredication pass that needs to know the number of elements + // processed by this vector loop. + if (TailFolded) + return true; + return false; +} void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Only currently enable these preferences for M-Class cores. Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6821,7 +6821,11 @@ IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + if (CM.TTI.emitGetActiveLaneMask(CM.TheLoop, CM.LI, *CM.PSE.getSE(), + !CM.isScalarEpilogueAllowed())) + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); + else + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; } Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -675,6 +675,7 @@ ICmpULE, SLPLoad, SLPStore, + ActiveLaneMask, }; private: Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -380,6 +380,55 @@ State.set(this, V, Part); break; } + case VPInstruction::ActiveLaneMask: { + // The vector 
induction variable. + Value *VIV = State.get(getOperand(0), Part); + // The Back-edge Taken Count (BTC) splat to a vector. + Value *SplatBTC = State.get(getOperand(1), Part); + + // Create the intrinsic call, which has equivalent semantics to + // icmp ule %VIV, %SplatBTC + // generating the mask of active/inactive lanes. + // + // We have 2 cases: incrementing loops, and decrementing loops, which means + // the BTC looks slightly different. For incrementing loops, it is this: + // + // VIV: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + // BTC: <4 x i32> <i32 430, i32 430, i32 430, i32 430> + // + // and for decrementing loops, it is: + // + // VIV: %vec.iv = add <16 x i32> %broadcast.splat, <i32 0, i32 1, ..., i32 15> + // BTC: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + + Value *ScalarBTC = nullptr; + if (auto *CDV = dyn_cast<ConstantDataVector>(SplatBTC)) + ScalarBTC = CDV->getSplatValue(); + else if (auto *ShuffleVec = dyn_cast<ShuffleVectorInst>(SplatBTC)) { + // Extract the scalar BTC from the insertelement instruction, which is its + // 2nd operand. 
+ Instruction *SplatInsert = dyn_cast<InsertElementInst>( + ShuffleVec->getOperand(0)); + assert(SplatInsert && "Unexpected backedge taken count form"); + ScalarBTC = SplatInsert->getOperand(1); + } + else + llvm_unreachable("Unexpected backedge taken count form"); + + auto *VIVElem0 = Builder.CreateExtractElement( + VIV, ConstantInt::get(ScalarBTC->getType(), 0)); + // VIV need not be an Instruction, so take the i1 type from the builder. + auto *Int1Ty = Builder.getInt1Ty(); + auto *VecTy = cast<VectorType>(VIV->getType()); + auto *PredTy = VectorType::get(Int1Ty, VecTy->getNumElements()); + + Instruction *Call = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, + {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); + + State.set(this, Call, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -421,6 +470,10 @@ case VPInstruction::SLPStore: O << "combined store"; break; + case VPInstruction::ActiveLaneMask: + O << "active lane mask"; + break; + default: O << Instruction::getOpcodeName(getOpcode()); } Index: llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll +++ llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll @@ -45,9 +45,12 @@ define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: prefer_folding( ; PREFER-FOLDING: vector.body: -; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32 -; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32 -; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32 +; PREFER-FOLDING: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> +; PREFER-FOLDING: %[[ELEM0:.*]] = extractelement <4 x i32> %induction, i32 0 +; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ELEM0]], i32 430) +; 
PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask, +; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask, +; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body ; ; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( @@ -507,9 +510,10 @@ define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: float( ; PREFER-FOLDING: vector.body: -; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32 -; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32 -; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32 +; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %{{.*}}, i32 430) +; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask +; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask +; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -15,9 +15,12 @@ define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 { ; COMMON-LABEL: @sgt_loopguard( ; COMMON: vector.body: -; CHECK-TF: masked.load -; CHECK-TF: masked.load -; CHECK-TF: masked.store + +; CHECK-TF: %[[ELEM:.*]] = extractelement <16 x i32> %vec.iv, i32 0 +; CHECK-TF: 
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[ELEM]], i32 %trip.count.minus{{.*}}) +; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask) entry: %cmp5 = icmp sgt i32 %N, 0 br i1 %cmp5, label %while.body.preheader, label %while.end Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll @@ -41,11 +41,14 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { ; COMMON-LABEL: tail_folding_enabled( ; COMMON: vector.body: -; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( -; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( +; COMMON: %[[INDUCTION:.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> +; COMMON: %[[ELEM0:.*]] = extractelement <4 x i64> %[[INDUCTION]], i64 0 +; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429) +; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask +; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask ; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]] -; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]] -; COMMON: br i1 %12, label %{{.*}}, label %vector.body +; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask +; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body 
entry: br label %for.body @@ -75,13 +78,16 @@ ; PREDFLAG-LABEL: tail_folding_disabled( ; PREDFLAG: vector.body: -; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( -; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( +; PREDFLAG: %[[INDUCTION:induction.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> +; PREDFLAG: %[[ELEM0:.*]] = extractelement <4 x i64> %[[INDUCTION]], i64 0 +; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429) +; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask +; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load -; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32( +; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREDFLAG: %index.next = add i64 %index, 4 -; PREDFLAG: %12 = icmp eq i64 %index.next, 432 -; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6 +; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432 +; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6 entry: br label %for.body