Index: llvm/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/include/llvm/Analysis/VectorUtils.h
+++ llvm/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,26 @@
   std::string ScalarName;   /// Scalar Function Name.
   std::string VectorName;   /// Vector Function Name associated to this VFInfo.
   VFISAKind ISA;            /// Instruction Set Architecture.
+
+  unsigned getParamIndexForMask() const {
+    auto MaskPos = getParamIndexForOptionalMask();
+    if (MaskPos)
+      return *MaskPos;
+
+    llvm_unreachable("Requested parameter index of non-existent mask!");
+  }
+
+  bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
+
+private:
+  Optional<unsigned> getParamIndexForOptionalMask() const {
+    unsigned ParamCount = Shape.Parameters.size();
+    for (unsigned i = 0; i < ParamCount; ++i)
+      if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
+        return i;
+
+    return None;
+  }
 };
 
 namespace VFABI {
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -477,11 +477,6 @@
   /// complex control flow around the loops.
   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
 
-  /// Widen a single call instruction within the innermost loop.
-  void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
-                            VPTransformState &State,
-                            Intrinsic::ID VectorIntrinsicID);
-
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
 
@@ -3461,15 +3456,35 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
+  InstructionCost MaskCost = 0;
   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
 
+  // If we want an unmasked vector function but can't find one matching the VF,
+  // and the target supports an active lane mask, maybe we can find a vector
+  // function that does use a mask and synthesize an all-true mask.
+  if (!VecFunc && TTI.emitGetActiveLaneMask() != PredicationStyle::None) {
+    Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+    VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+    // If we found one, add in the cost of creating a mask.
+    // It would be nicer if we added the mask as an operand to the recipe since
+    // we could potentially share a mask across multiple calls, but we might
+    // have a non-masked variant at one VF and a masked variant at a different
+    // VF, so we couldn't construct a shared recipe.
+    if (VecFunc)
+      MaskCost = TTI.getShuffleCost(
+          TargetTransformInfo::SK_Broadcast,
+          VectorType::get(
+              IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
+              VF));
+  }
   if (!TLI || CI->isNoBuiltin() || !VecFunc)
     return Cost;
 
   // If the corresponding vector cost is cheaper, return its cost.
   InstructionCost VectorCallCost =
-      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
+      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput) +
+      MaskCost;
   if (VectorCallCost < Cost) {
     NeedToScalarize = false;
     Cost = VectorCallCost;
Index: llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -440,11 +440,6 @@
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFromInst(&CI);
 
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI.args())
-    Tys.push_back(
-        ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue()));
-
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
@@ -463,6 +458,8 @@
     }
 
     Function *VectorF;
+    bool VectorFTakesMask = false;
+    unsigned VectorFMaskPos = 0;
     if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
       // Use vector version of the intrinsic.
      if (State.VF.isVector())
@@ -473,13 +470,39 @@
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
       // Use vector version of the function call.
-      const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/);
+      VFShape Shape = VFShape::get(CI, State.VF, /*HasGlobalPred=*/false);
+
+      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+
+      // TODO: Do we need TTI checks for masking here? Or can we
+      // assume it works by this point? Maybe add to the recipe...
+      if (!VectorF) {
+        Shape = VFShape::get(CI, State.VF, /*HasGlobalPred=*/true);
+        VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      }
 #ifndef NDEBUG
-      assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
-             "Can't create vector function.");
+      assert(VectorF != nullptr && "Can't create vector function.");
 #endif
-      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      // Check the VFInfo for masking details
+      for (VFInfo Info : VFDatabase(CI).getMappings(CI)) {
+        if (Info.Shape == Shape) {
+          VectorFTakesMask = Info.isMasked();
+          if (VectorFTakesMask)
+            VectorFMaskPos = Info.getParamIndexForMask();
+          break;
+        }
+      }
     }
+
+    // If the function takes a mask parameter, we need to synthesize one
+    // that's true for all lanes.
+    if (VectorFTakesMask) {
+      Value *Mask = ConstantInt::getTrue(VectorType::get(
+          IntegerType::getInt1Ty(VectorF->getFunctionType()->getContext()),
+          State.VF));
+      Args.insert(Args.begin() + VectorFMaskPos, Mask);
+    }
+
     SmallVector<OperandBundleDef, 1> OpBundles;
     CI.getOperandBundlesAsDefs(OpBundles);
     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
Index: llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -10,36 +10,41 @@
 define void @test_widen(i64* noalias %a, i64* readnone %b) #4 {
 ; TFNONE-LABEL: @test_widen(
 ; TFNONE-NEXT:  entry:
-; TFNONE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFNONE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TFNONE:       vector.ph:
+; TFNONE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; TFNONE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; TFNONE:       vector.body:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFNONE-NEXT:    [[TMP0:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
-; TFNONE-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]
-; TFNONE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; TFNONE-NEXT:    [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]]
-; TFNONE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; TFNONE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1
-; TFNONE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>*
-; TFNONE-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP9]], align 4
-; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; TFNONE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; TFNONE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFNONE-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 2 x i64>*
+; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP5]], align 4
+; TFNONE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; TFNONE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
+; TFNONE-NEXT:    [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 2 x i64>*
+; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64>* [[TMP8]], align 4
+; TFNONE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; TFNONE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TFNONE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TFNONE:       middle.block:
-; TFNONE-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; TFNONE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; TFNONE-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; TFNONE:       scalar.ph:
-; TFNONE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; TFNONE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; TFNONE-NEXT:    br label [[FOR_BODY:%.*]]
 ; TFNONE:       for.body:
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, i64* [[B]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, i64* [[GEP]], align 4
-; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]]
+; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDVARS_IV]]
 ; TFNONE-NEXT:    store i64 [[CALL]], i64* [[ARRAYIDX]], align 4
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -66,36 +71,41 @@
 ;
 ; TFFALLBACK-LABEL: @test_widen(
 ; TFFALLBACK-NEXT:  entry:
-; TFFALLBACK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; TFFALLBACK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; TFFALLBACK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; TFFALLBACK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; TFFALLBACK:       vector.ph:
+; TFFALLBACK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; TFFALLBACK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; TFFALLBACK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; TFFALLBACK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK:       vector.body:
 ; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; TFFALLBACK-NEXT:    [[TMP0:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; TFFALLBACK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
-; TFFALLBACK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; TFFALLBACK-NEXT:    [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]
-; TFFALLBACK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; TFFALLBACK-NEXT:    [[TMP5:%.*]] = call i64 @foo(i64 [[TMP4]]) #[[ATTR2]]
-; TFFALLBACK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; TFFALLBACK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP5]], i32 1
-; TFFALLBACK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>*
-; TFFALLBACK-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP9]], align 4
-; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; TFFALLBACK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; TFFALLBACK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TFFALLBACK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 2 x i64>*
+; TFFALLBACK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP5]], align 4
+; TFFALLBACK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; TFFALLBACK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
+; TFFALLBACK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 2 x i64>*
+; TFFALLBACK-NEXT:    store <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64>* [[TMP8]], align 4
+; TFFALLBACK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; TFFALLBACK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TFFALLBACK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TFFALLBACK:       middle.block:
-; TFFALLBACK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; TFFALLBACK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; TFFALLBACK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; TFFALLBACK:       scalar.ph:
-; TFFALLBACK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; TFFALLBACK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; TFFALLBACK-NEXT:    br label [[FOR_BODY:%.*]]
 ; TFFALLBACK:       for.body:
 ; TFFALLBACK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[GEP:%.*]] = getelementptr i64, i64* [[B]], i64 [[INDVARS_IV]]
 ; TFFALLBACK-NEXT:    [[LOAD:%.*]] = load i64, i64* [[GEP]], align 4
-; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2]]
+; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
 ; TFFALLBACK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDVARS_IV]]
 ; TFFALLBACK-NEXT:    store i64 [[CALL]], i64* [[ARRAYIDX]], align 4
 ; TFFALLBACK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1