Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -664,6 +664,12 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.gather
+  /// intrinsics.
+  bool forceScalarizeMaskedGather(Type *Type, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.scatter
+  /// intrinsics.
+  bool forceScalarizeMaskedScatter(Type *Type, Align Alignment) const;

   /// Return true if the target supports masked compress store.
   bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1543,6 +1549,8 @@
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool enableOrderedReductions() = 0;
@@ -1945,6 +1953,12 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedGather(DataType, Alignment);
+  }
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) override {
+    return Impl.forceScalarizeMaskedScatter(DataType, Alignment);
+  }
   bool isLegalMaskedCompressStore(Type *DataType) override {
     return Impl.isLegalMaskedCompressStore(DataType);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -267,6 +267,14 @@
     return false;
   }

+  bool forceScalarizeMaskedGather(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalMaskedCompressStore(Type *DataType) const { return false; }

   bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@
   return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
 }

+bool TargetTransformInfo::forceScalarizeMaskedGather(Type *DataType,
+                                                     Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedCompressStore(DataType);
 }
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@
     return isLegalMaskedLoad(DataTy, Alignment);
   }

+  bool forceScalarizeMaskedGather(Type *VTy, Align Alignment) {
+    // For MVE, we have a custom lowering pass that will already have custom
+    // legalised any gathers that we can to MVE intrinsics, and want to expand
+    // all the rest. The pass runs before the masked intrinsic lowering pass,
+    // so if we are here, we know we want to expand.
+    return true;
+  }
+
+  bool forceScalarizeMaskedScatter(Type *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
+
   bool isLegalMaskedGather(Type *Ty, Align Alignment);

   bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@
   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
     return false;

-  // This method is called in 2 places:
-  //  - from the vectorizer with a scalar type, in which case we need to get
-  //  this as good as we can with the limited info we have (and rely on the cost
-  //  model for the rest).
-  //  - from the masked intrinsic lowering pass with the actual vector type.
-  // For MVE, we have a custom lowering pass that will already have custom
-  // legalised any gathers that we can to MVE intrinsics, and want to expand all
-  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
-  // are here, we know we want to expand.
-  if (isa<FixedVectorType>(Ty))
-    return false;
-
   unsigned EltWidth = Ty->getScalarSizeInBits();
   return ((EltWidth == 32 && Alignment >= 4) ||
           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -226,6 +226,10 @@
   bool isLegalMaskedStore(Type *DataType, Align Alignment);
   bool isLegalNTLoad(Type *DataType, Align Alignment);
   bool isLegalNTStore(Type *DataType, Align Alignment);
+  bool forceScalarizeMaskedGather(Type *VTy, Align Alignment);
+  bool forceScalarizeMaskedScatter(Type *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
   bool isLegalMaskedGather(Type *DataType, Align Alignment);
   bool isLegalMaskedScatter(Type *DataType, Align Alignment);
   bool isLegalMaskedExpandLoad(Type *DataType);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4976,9 +4976,11 @@
                                        const Instruction *I = nullptr) {
   if (CostKind != TTI::TCK_RecipThroughput) {
     if ((Opcode == Instruction::Load &&
-         isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+         isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
+         !forceScalarizeMaskedGather(SrcVTy, Align(Alignment))) ||
         (Opcode == Instruction::Store &&
-         isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+         isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
+         !forceScalarizeMaskedScatter(SrcVTy, Align(Alignment))))
       return 1;
     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
@@ -4993,9 +4995,11 @@
   unsigned AddressSpace = PtrTy->getAddressSpace();

   if ((Opcode == Instruction::Load &&
-       !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
+        forceScalarizeMaskedGather(SrcVTy, Align(Alignment)))) ||
       (Opcode == Instruction::Store &&
-       !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
+        forceScalarizeMaskedScatter(SrcVTy, Align(Alignment)))))
     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                            AddressSpace);
@@ -5118,35 +5122,21 @@
   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
 }

+bool X86TTIImpl::forceScalarizeMaskedGather(Type *VTy, Align Alignment) {
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+  // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
+  // it to 8 elements, but zeroing upper bits of the mask vector will add more
+  // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
+  // Check, maybe the gather/scatter instruction is better in the VariableMask
+  // case.
+  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
+  return NumElts == 1 ||
+         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   if (!supportsGather())
     return false;
-
-  // This function is called now in two cases: from the Loop Vectorizer
-  // and from the Scalarizer.
-  // When the Loop Vectorizer asks about legality of the feature,
-  // the vectorization factor is not calculated yet. The Loop Vectorizer
-  // sends a scalar type and the decision is based on the width of the
-  // scalar element.
-  // Later on, the cost model will estimate usage this intrinsic based on
-  // the vector type.
-  // The Scalarizer asks again about legality. It sends a vector type.
-  // In this case we can reject non-power-of-2 vectors.
-  // We also reject single element vectors as the type legalizer can't
-  // scalarize it.
-  if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
-    unsigned NumElts = DataVTy->getNumElements();
-    if (NumElts == 1)
-      return false;
-    // Gather / Scatter for vector 2 is not profitable on KNL / SKX
-    // Vector-4 of gather/scatter instruction does not exist on KNL.
-    // We can extend it to 8 elements, but zeroing upper bits of
-    // the mask vector will add more instructions. Right now we give the scalar
-    // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
-    // instruction is better in the VariableMask case.
-    if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
-      return false;
-  }
   Type *ScalarTy = DataTy->getScalarType();
   if (ScalarTy->isPointerTy())
     return true;
Index: llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@
     Type *LoadTy = CI->getType();
     Align Alignment =
         DL.getValueOrABITypeAlignment(MA, LoadTy->getScalarType());
-    if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+    if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+        !TTI.forceScalarizeMaskedGather(LoadTy, Alignment))
       return false;
     scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
     return true;
@@ -970,7 +971,8 @@
     Type *StoreTy = CI->getArgOperand(0)->getType();
     Align Alignment =
         DL.getValueOrABITypeAlignment(MA, StoreTy->getScalarType());
-    if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+    if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+        !TTI.forceScalarizeMaskedScatter(StoreTy, Alignment))
       return false;
     scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
     return true;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1577,13 +1577,16 @@
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
+  bool isLegalGatherOrScatter(Value *V,
+                              ElementCount VF = ElementCount::getFixed(1)) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
     if (!LI && !SI)
       return false;
     auto *Ty = getLoadStoreType(V);
     Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
@@ -1602,12 +1605,13 @@
   /// instructions that may divide by zero.
   /// If a non-zero VF has been calculated, we check if I will be scalarized
   /// predication for that VF.
-  bool isScalarWithPredication(Instruction *I) const;
+  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+  bool isPredicatedInst(Instruction *I, ElementCount VF,
+                        bool IsKnownUniform = false) {
     // When we know the load is uniform and the original scalar loop was not
     // predicated we don't need to mark it as a predicated instruction. Any
     // vectorised blocks created when tail-folding are something artificial we
@@ -1623,7 +1627,7 @@
     // instructions.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       return Legal->isMaskRequired(I);
-    return isScalarWithPredication(I);
+    return isScalarWithPredication(I, VF);
   }

   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -1813,7 +1817,7 @@
   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
-  bool useEmulatedMaskMemRefHack(Instruction *I);
+  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

   /// Map of scalar integer values to the smallest bitwidth they can be legally
   /// represented as. The vector equivalents of these values should be truncated
@@ -4974,7 +4978,8 @@
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }

-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+    Instruction *I, ElementCount VF) const {
   if (!blockNeedsPredicationForAnyReason(I->getParent()))
     return false;
   switch(I->getOpcode()) {
@@ -4986,11 +4991,14 @@
       return false;
     auto *Ptr = getLoadStorePointerOperand(I);
     auto *Ty = getLoadStoreType(I);
+    Type *VTy = Ty;
+    if (VF.isVector())
+      VTy = VectorType::get(Ty, VF);
     const Align Alignment = getLoadStoreAlignment(I);
     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
-                                TTI.isLegalMaskedGather(Ty, Alignment))
+                                TTI.isLegalMaskedGather(VTy, Alignment))
                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
-                                TTI.isLegalMaskedScatter(Ty, Alignment));
+                                TTI.isLegalMaskedScatter(VTy, Alignment));
   case Instruction::UDiv:
   case Instruction::SDiv:
@@ -5063,7 +5071,7 @@

   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
-  if (isScalarWithPredication(I))
+  if (isScalarWithPredication(I, VF))
     return false;

   // If the instruction's allocated size doesn't equal it's type size, it
@@ -5114,7 +5122,7 @@
                         << *I << "\n");
       return;
     }
-    if (isScalarWithPredication(I)) {
+    if (isScalarWithPredication(I, VF)) {
       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                         << *I << "\n");
       return;
     }
@@ -6530,7 +6538,8 @@
   return RUs;
 }

-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+                                                           ElementCount VF) {
   // TODO: Cost model for emulated masked load/store is completely
   // broken. This hack guides the cost model to use an artificially
   // high enough value to practically disable vectorization with such
@@ -6539,8 +6548,7 @@
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isPredicatedInst(I) &&
-         "Expecting a scalar emulated instruction");
+  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) &&
           NumPredStores > NumberOfStoresToPredicate);
@@ -6567,13 +6575,13 @@
     if (!blockNeedsPredicationForAnyReason(BB))
       continue;
     for (Instruction &I : *BB)
-      if (isScalarWithPredication(&I)) {
+      if (isScalarWithPredication(&I, VF)) {
         ScalarCostsTy ScalarCosts;
         // Do not apply discount if scalable, because that would lead to
         // invalid scalarization costs.
         // Do not apply discount logic if hacked cost is needed
         // for emulated masked memrefs.
-        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
         // Remember that BB will remain after vectorization.
@@ -6609,7 +6617,7 @@

     // If the instruction is scalar with predication, it will be analyzed
     // separately. We ignore it within the context of PredInst.
-    if (isScalarWithPredication(I))
+    if (isScalarWithPredication(I, VF))
       return false;

     // If any of the instruction's operands are uniform after vectorization,
@@ -6656,7 +6664,7 @@

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(ToVectorTy(I->getType(), VF)),
           APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6819,7 +6827,7 @@
   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
-  if (isPredicatedInst(I)) {
+  if (isPredicatedInst(I, VF)) {
     Cost /= getReciprocalPredBlockProb();

     // Add the cost of an i1 extract and a branch
@@ -6830,7 +6838,7 @@
         /*Insert=*/false, /*Extract=*/true);
     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

-    if (useEmulatedMaskMemRefHack(I))
+    if (useEmulatedMaskMemRefHack(I, VF))
       // Artificially setting to a high enough value to practically disable
       // vectorization with such operations.
       Cost = 3000000;
@@ -7218,7 +7226,7 @@
       // predicated uniform stores. Today they are treated as any other
       // predicated store (see added test cases in
       // invariant-store-vectorization.ll).
-      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
         NumPredStores++;

       if (Legal->isUniformMemOp(I)) {
@@ -7228,7 +7236,7 @@
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
         InstructionCost Cost;
         if (isa<StoreInst>(&I) && VF.isScalable() &&
-            isLegalGatherOrScatter(&I)) {
+            isLegalGatherOrScatter(&I, VF)) {
           Cost = getGatherScatterCost(&I, VF);
           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
         } else {
@@ -7270,7 +7278,7 @@
       }
       InstructionCost GatherScatterCost =
-          isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I, VF)
               ? getGatherScatterCost(&I, VF) * NumAccesses
               : InstructionCost::getInvalid();
@@ -7473,7 +7481,7 @@
   // vector lane. Get the scalarization cost and scale this amount by the
   // probability of executing the predicated block. If the instruction is not
   // predicated, we fall through to the next case.
-  if (VF.isVector() && isScalarWithPredication(I)) {
+  if (VF.isVector() && isScalarWithPredication(I, VF)) {
     InstructionCost Cost = 0;

     // These instructions have a non-void type, so account for the phi nodes
@@ -8671,7 +8679,9 @@
                                                    VFRange &Range) const {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+      [this, CI](ElementCount VF) {
+        return CM.isScalarWithPredication(CI, VF);
+      },
       Range);

   if (IsPredicated)
@@ -8711,7 +8721,8 @@
   // scalarization is profitable or it is predicated.
   auto WillScalarize = [this, I](ElementCount VF) -> bool {
     return CM.isScalarAfterVectorization(I, VF) ||
-           CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
+           CM.isProfitableToScalarize(I, VF) ||
+           CM.isScalarWithPredication(I, VF);
   };
   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                              Range);
@@ -8785,7 +8796,7 @@
       Range);

   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
       Range);

   // Even if the instruction is not marked as uniform, there are certain
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3373,7 +3373,9 @@
         CommonAlignment =
             commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
       if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
-                                  CommonAlignment))
+                                  CommonAlignment) &&
+          !TTI.forceScalarizeMaskedGather(
+              FixedVectorType::get(ScalarTy, VL.size()), CommonAlignment))
         return LoadsState::ScatterVectorize;
     }
Index: llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -83,7 +83,7 @@
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
 }

-attributes #0 = { "target-features"="+neon,+sve" }
+attributes #0 = { "target-features"="+neon,+sve" vscale_range(2, 0) }

 !0 = distinct !{!0, !1, !2, !3, !4}
 !1 = !{!"llvm.loop.mustprogress"}
Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -90,4 +90,4 @@
   ret void
 }

-attributes #0 = { "target-features"="+neon,+sve,+v8.1a" }
+attributes #0 = { "target-features"="+neon,+sve,+v8.1a" vscale_range(2, 0) }
Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -125,7 +125,7 @@
   br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
 }

-attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve" vscale_range(2,0) }

 !0 = distinct !{!0, !1, !2, !3, !4, !5}
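
Note for reviewers (not part of the patch): a minimal sketch of the query pattern this change establishes for TTI clients. The helper name below is hypothetical; the two TTI entry points are the ones added above, and the combined check mirrors what ScalarizeMaskedMemIntrin.cpp and SLPVectorizer.cpp now do.

// Sketch only, assuming the new TTI hooks from this patch.
#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Hypothetical client-side helper: keep an llvm.masked.gather as an intrinsic
// only if the target both supports it and does not ask for it to be
// scalarized anyway (e.g. ARM MVE, or 2/4-element vectors on AVX-512
// without VLX).
static bool shouldKeepMaskedGather(const TargetTransformInfo &TTI,
                                   Type *DataTy, Align Alignment) {
  return TTI.isLegalMaskedGather(DataTy, Alignment) &&
         !TTI.forceScalarizeMaskedGather(DataTy, Alignment);
}

With this split, a target such as ARM MVE can keep reporting gathers as legal (so the vectorizer still costs them) while forcing ScalarizeMaskedMemIntrin to expand whatever its custom gather/scatter lowering pass did not handle.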