diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,26 @@
   std::string ScalarName; /// Scalar Function Name.
   std::string VectorName; /// Vector Function Name associated to this VFInfo.
   VFISAKind ISA;          /// Instruction Set Architecture.
+
+  unsigned getParamIndexForMask() const {
+    auto MaskPos = getParamIndexForOptionalMask();
+    if (MaskPos)
+      return *MaskPos;
+
+    llvm_unreachable("Requested parameter index of non-existent mask!");
+  }
+
+  bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
+
+private:
+  Optional<unsigned> getParamIndexForOptionalMask() const {
+    unsigned ParamCount = Shape.Parameters.size();
+    for (unsigned i = 0; i < ParamCount; ++i)
+      if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
+        return i;
+
+    return None;
+  }
 };
 
 namespace VFABI {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1107,6 +1107,21 @@
     if (isa<NoAliasScopeDeclInst>(&I))
       continue;
 
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      // Check whether we have at least one masked vector version of a scalar
+      // function.
+      bool HasMaskedVersion = false;
+
+      auto Mappings = VFDatabase::getMappings(*CI);
+      for (VFInfo Info : Mappings)
+        HasMaskedVersion |= Info.isMasked();
+
+      if (HasMaskedVersion) {
+        MaskedOp.insert(CI);
+        continue;
+      }
+    }
+
     // Loads are handled via masking (or speculated if safe to do so.)
     if (auto *LI = dyn_cast<LoadInst>(&I)) {
       if (!SafePtrs.count(LI->getPointerOperand()))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -475,7 +475,7 @@
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
-                            VPTransformState &State);
+                            VPTransformState &State, bool MaskAvailable);
 
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -1533,6 +1533,7 @@
   /// scalarized -
   /// i.e. either vector version isn't available, or is too expensive.
   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
+                                    bool NeedsMask,
                                     bool &NeedToScalarize) const;
 
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
@@ -3401,6 +3402,7 @@
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
+                                              bool NeedsMask,
                                               bool &NeedToScalarize) const {
   Function *F = CI->getCalledFunction();
   Type *ScalarRetTy = CI->getType();
@@ -3432,8 +3434,16 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
-  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+  VFShape Shape = VFShape::get(*CI, VF, NeedsMask);
   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  // If we want an unmasked vector function but can't find one matching the VF,
+  // and the target supports an active lane mask, maybe we can find a vector
+  // function that does use a mask and synthesize an all-true mask.
+  if (!VecFunc && !NeedsMask &&
+      TTI.emitGetActiveLaneMask() != PredicationStyle::None) {
+    Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+    VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  }
 
   if (!TLI || CI->isNoBuiltin() || !VecFunc)
     return Cost;
@@ -4155,22 +4165,20 @@
 void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
                                                VPUser &ArgOperands,
-                                               VPTransformState &State) {
+                                               VPTransformState &State,
+                                               bool MaskAvailable) {
   assert(!isa<DbgInfoIntrinsic>(CI) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFromInst(&CI);
 
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI.args())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
 
   // The flag shows whether we use Intrinsic or a usual Call for vectorized
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize = false;
-  InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
+  InstructionCost CallCost =
+      Cost->getVectorCallCost(&CI, VF, MaskAvailable, NeedToScalarize);
   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
@@ -4179,6 +4187,13 @@
   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
          "Either the intrinsic cost or vector call cost must be valid");
 
+  // If we added a mask operand in the recipe, extract it so that we can
+  // insert it in the right position for the vectorized call. The mask isn't
+  // guaranteed to be the last argument.
+  VPValue *VPMask = nullptr;
+  if (MaskAvailable)
+    VPMask = ArgOperands.removeAndReturnLastOperand();
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
@@ -4197,6 +4212,9 @@
     }
 
     Function *VectorF;
+    bool VectorFTakesMask = false;
+    unsigned VectorFMaskPos = 0;
+
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       if (VF.isVector())
@@ -4206,22 +4224,52 @@
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
       // Use vector version of the function call.
-      const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
+      VFShape Shape = VFShape::get(CI, VF, MaskAvailable);
+
+      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+
+      if (!VectorF && !MaskAvailable &&
+          TTI->emitGetActiveLaneMask() != PredicationStyle::None) {
+        Shape = VFShape::get(CI, VF, /*HasGlobalPred=*/true);
+        VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      }
 #ifndef NDEBUG
-      assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
-             "Can't create vector function.");
+      assert(VectorF != nullptr && "Can't create vector function.");
 #endif
-      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      // Check the VFInfo for masking details.
+      for (VFInfo Info : VFDatabase(CI).getMappings(CI)) {
+        if (Info.Shape == Shape) {
+          VectorFTakesMask = Info.isMasked();
+          if (VectorFTakesMask)
+            VectorFMaskPos = Info.getParamIndexForMask();
+          break;
+        }
+      }
+    }
+
+    assert((!MaskAvailable || VectorFTakesMask) &&
+           "Mask supplied for function with no mask argument");
+
+    if (VectorFTakesMask) {
+      Value *Mask = nullptr;
+      if (VPMask)
+        Mask = State.get(VPMask, Part);
+      else
+        Mask = ConstantInt::getTrue(VectorType::get(
+            IntegerType::getInt1Ty(VectorF->getFunctionType()->getContext()),
+            VF));
+      Args.insert(Args.begin() + VectorFMaskPos, Mask);
     }
-    SmallVector<OperandBundleDef, 1> OpBundles;
-    CI.getOperandBundlesAsDefs(OpBundles);
-    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-    if (isa<FPMathOperator>(V))
-      V->copyFastMathFlags(&CI);
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    CI.getOperandBundlesAsDefs(OpBundles);
+    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+    if (isa<FPMathOperator>(V))
+      V->copyFastMathFlags(&CI);
 
-    State.set(Def, V, Part);
-    State.addMetadata(V, &CI);
+    State.set(Def, V, Part);
+    State.addMetadata(V, &CI);
   }
 }
@@ -7284,7 +7332,8 @@
     return *RedCost;
   bool NeedToScalarize;
   CallInst *CI = cast<CallInst>(I);
-  InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+  InstructionCost CallCost =
+      getVectorCallCost(CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
   if (getVectorIntrinsicIDForCall(CI, TLI)) {
     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
     return std::min(CallCost, IntrinsicCost);
@@ -8270,7 +8319,8 @@
 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range) const {
+                                                   VFRange &Range,
+                                                   VPlanPtr &Plan) {
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
@@ -8281,6 +8331,10 @@
   if (IsPredicated)
     return nullptr;
 
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(CI))
+    Mask = createBlockInMask(CI->getParent(), Plan);
+
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
@@ -8295,7 +8349,8 @@
         // version of the instruction.
         // Is it beneficial to perform intrinsic call compared to lib call?
         bool NeedToScalarize = false;
-        InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+        InstructionCost CallCost = CM.getVectorCallCost(
+            CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
         InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
         bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
         return UseVectorIntrinsic || !NeedToScalarize;
@@ -8305,7 +8360,7 @@
     return nullptr;
 
   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
-  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), Mask);
 }
 
 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -8564,7 +8619,7 @@
     return nullptr;
 
   if (auto *CI = dyn_cast<CallInst>(Instr))
-    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
@@ -9273,7 +9328,7 @@
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
-                                  *this, State);
+                                  *this, State, Mask);
 }
 
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -95,7 +95,7 @@
   /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
   /// decision from \p Range.Start to \p Range.End.
   VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range) const;
+                                    VFRange &Range, VPlanPtr &Plan);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -946,12 +946,17 @@
 /// A recipe for widening Call instructions.
 class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
+  bool Mask;
 
 public:
   template <typename IterT>
-  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
+                    VPValue *MaskVal = nullptr)
       : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
-        VPValue(VPValue::VPVWidenCallSC, &I, this) {}
+        VPValue(VPValue::VPVWidenCallSC, &I, this), Mask(MaskVal != nullptr) {
+    if (MaskVal)
+      addOperand(MaskVal);
+  }
 
   ~VPWidenCallRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -270,6 +270,12 @@
     Op->removeUser(*this);
   }
 
+  VPValue *removeAndReturnLastOperand() {
+    VPValue *Op = Operands.pop_back_val();
+    Op->removeUser(*this);
+    return Op;
+  }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -8,10 +8,14 @@
 ; primary mask, and that without tail folding we synthesize an all-true mask.
 define void @test_widen(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen(
-; LV-NOT: call @foo_vector
-; TFALWAYS-NOT: vector.body
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_vector
+; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>
+; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; TFALWAYS: %[[MASK:.+]] = phi <vscale x 2 x i1>
+; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
+; TFFALLBACK: %[[MASK:.+]] = phi <vscale x 2 x i1>
+; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -34,9 +38,9 @@
 ; Check that a simple conditional call can be vectorized.
 define void @test_if_then(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_if_then(
-; LV-NOT: call @foo_vector
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_vector
+; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.load, <vscale x 2 x i1> %{{.+}})
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -70,12 +74,31 @@
 ; we just see a splat of the parameter instead. More work needed.
 define void @test_widen_if_then_else(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_if_then_else
-; LV-NOT: call @foo_vector
-; LV-NOT: call @foo_uniform
-; TFALWAYS-NOT: call @foo_vector
-; TFALWAYS-NOT: call @foo_uniform
-; TFFALLBACK-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_uniform
+; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* %{{[0-9]+}}
+; LV: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; LV: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; LV: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[INV]])
+; LV: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[CMP]])
+; LV: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[INV]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; LV: store <vscale x 2 x i64> %[[PPHI]]
+; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+; TFALWAYS: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; TFALWAYS: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFALWAYS: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
+; TFALWAYS: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
+; TFALWAYS: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
+; TFALWAYS: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
+; TFALWAYS: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; TFALWAYS: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
+; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+; TFFALLBACK: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; TFFALLBACK: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFFALLBACK: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
+; TFFALLBACK: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
+; TFFALLBACK: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
+; TFFALLBACK: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
+; TFFALLBACK: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; TFFALLBACK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -112,10 +135,9 @@
 ; unpredicated body with scalar tail can use the unmasked variant.
 define void @test_widen_nomask(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_nomask(
-; LV: call @foo_vector_nomask
-; TFALWAYS-NOT: vector.body
+; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
 ; TFALWAYS-NOT: call @foo_vector_nomask
-; TFFALLBACK: call @foo_vector_nomask
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -140,10 +162,9 @@
 ; version.
 define void @test_widen_optmask(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_optmask(
-; LV: call @foo_vector_nomask
-; TFALWAYS-NOT: vector.body
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK: call @foo_vector_nomask
+; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
 ; CHECK: ret void
 entry:
   br label %for.body
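
For context, the variant mappings that VFDatabase::getMappings reads (and that the new isMasked()/getParamIndexForMask() helpers inspect) come from the "vector-function-abi-variant" attribute on the scalar call site; the 'M' token in the mangled name marks a variant that takes a global predicate (mask) parameter. The actual declarations in masked-call.ll sit outside the hunks shown above, so the following is only a minimal illustrative sketch of such a mapping; the function names, types, and VF below are assumptions, not taken from the patch:

declare i64 @foo(i64)
declare <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64>, <vscale x 2 x i1>)

define void @caller(i64* %p) {
  %v = load i64, i64* %p
  ; The call-site attribute maps @foo to the masked SVE variant @foo_vector:
  ; "_ZGVsMxv" = SVE ISA ('s'), masked ('M'), scalable VF ('x'), one vector
  ; parameter ('v'); the redirected vector name follows in parentheses.
  %r = call i64 @foo(i64 %v) #0
  store i64 %r, i64* %p
  ret void
}

attributes #0 = { "vector-function-abi-variant"="_ZGVsMxv_foo(foo_vector)" }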