diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1669,6 +1669,11 @@ /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + InstructionCost getCompactCost() const; + bool isTargetSupportedCompactStore() const; + unsigned getTargetSupportedCompact() const; + unsigned getTargetSupportedCNTP() const; + /// @} private: @@ -2035,6 +2040,10 @@ getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual bool isTargetSupportedCompactStore() const = 0; + virtual unsigned getTargetSupportedCompact() const = 0; + virtual unsigned getTargetSupportedCNTP() const = 0; + virtual InstructionCost getCompactCost() const = 0; }; template @@ -2745,6 +2754,22 @@ unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + bool isTargetSupportedCompactStore() const override { + return Impl.isTargetSupportedCompactStore(); + } + + unsigned getTargetSupportedCompact() const override { + return Impl.getTargetSupportedCompact(); + } + + unsigned getTargetSupportedCNTP() const override { + return Impl.getTargetSupportedCNTP(); + } + + InstructionCost getCompactCost() const override { + return Impl.getCompactCost(); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -891,6 +891,13 @@ unsigned getMaxNumArgs() const { return UINT_MAX; } + bool isTargetSupportedCompactStore() const { return false; } + unsigned getTargetSupportedCompact() const { return 0; } + unsigned getTargetSupportedCNTP() const { return 0; } + InstructionCost getCompactCost() const { + return InstructionCost::getInvalid(); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -700,6 +700,10 @@ return getST()->getMaxPrefetchIterationsAhead(); } + virtual InstructionCost getCompactCost() const { + return InstructionCost::getInvalid(); + } + virtual bool enableWritePrefetching() const { return getST()->enableWritePrefetching(); } diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -412,6 +412,14 @@ const RecurrenceDescriptor &Desc, Value *Src, PHINode *OrigPhi = nullptr); +Value *createTargetCompact(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val); + +Value *createTargetCNTP(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val); + /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
Value *createOrderedReduction(IRBuilderBase &B,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -224,6 +224,26 @@
   Instruction *ExactFPMathInst = nullptr;
 };
 
+class CompactDescriptor {
+  PHINode *LiveOutPhi;
+  bool IsCompactSign;
+  SmallPtrSet<Value *, 8> Chain;
+
+public:
+  CompactDescriptor() = default;
+  CompactDescriptor(SmallPtrSetImpl<Value *> &CompactChain, PHINode *LiveOut,
+                    bool IsSign)
+      : LiveOutPhi(LiveOut), IsCompactSign(IsSign) {
+    Chain.insert(CompactChain.begin(), CompactChain.end());
+  }
+
+  bool isInCompactChain(Value *V) const { return Chain.find(V) != Chain.end(); }
+
+  PHINode *getLiveOutPhi() const { return LiveOutPhi; }
+
+  bool isSign() const { return IsCompactSign; }
+};
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -261,6 +281,8 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
 
+  using CompactList = MapVector<PHINode *, CompactDescriptor>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -397,6 +419,14 @@
   DominatorTree *getDominatorTree() const { return DT; }
 
+  const CompactList &getCompactList() const { return CpList; }
+
+  bool hasCompactChain() const { return CpList.size() > 0; }
+
+  PHINode *getCompactChainStart(Instruction *I) const;
+
+  bool isSign(PHINode *Phi) { return CpList[Phi].isSign(); }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -425,6 +455,8 @@
   /// and we only need to check individual instructions.
   bool canVectorizeInstrs();
 
+  bool isMatchCompact(PHINode *Phi, Loop *TheLoop, CompactDescriptor &CpDesc);
+
   /// When we vectorize loops we may change the order in which
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
@@ -538,6 +570,9 @@
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  // Record the compact chains found in the loop.
+ CompactList CpList; }; } // namespace llvm diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1237,6 +1237,22 @@ return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::isTargetSupportedCompactStore() const { + return TTIImpl->isTargetSupportedCompactStore(); +} + +unsigned TargetTransformInfo::getTargetSupportedCompact() const { + return TTIImpl->getTargetSupportedCompact(); +} + +unsigned TargetTransformInfo::getTargetSupportedCNTP() const { + return TTIImpl->getTargetSupportedCNTP(); +} + +InstructionCost TargetTransformInfo::getCompactCost() const { + return TTIImpl->getCompactCost(); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" @@ -301,6 +302,11 @@ case ISD::FFREXP: Res = PromoteIntRes_FFREXP(N); break; + case ISD::INTRINSIC_WO_CHAIN: + if (N->getConstantOperandVal(0) == Intrinsic::aarch64_sve_compact) { + Res = PromoteIntRes_COMPACT(N); + break; + } } // If the result is null then the sub-method took care of registering it. @@ -5957,6 +5963,12 @@ return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } +SDValue DAGTypeLegalizer::PromoteIntRes_COMPACT(SDNode *N) { + SDValue OpExt = SExtOrZExtPromotedInteger(N->getOperand(2)); + return DAG.getNode(N->getOpcode(), SDLoc(N), OpExt.getValueType(), + N->getOperand(0), N->getOperand(1), OpExt); +} + SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); SDValue Op = N->getOperand(OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -364,6 +364,7 @@ SDValue PromoteIntRes_FunnelShift(SDNode *N); SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); + SDValue PromoteIntRes_COMPACT(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -24,8 +24,8 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include -#include namespace llvm { @@ -410,6 +410,15 @@ return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + bool isTargetSupportedCompactStore() const { return ST->hasSVE(); } + unsigned getTargetSupportedCompact() const { + return Intrinsic::aarch64_sve_compact; + } + unsigned getTargetSupportedCNTP() const { + return Intrinsic::aarch64_sve_cntp; + } + InstructionCost getCompactCost() const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3816,3 +3816,5 @@ return AM.Scale != 0 && AM.Scale != 1; return -1; } + +InstructionCost AArch64TTIImpl::getCompactCost() const { return 6; } diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" @@ -1123,6 +1124,34 @@ return createSimpleTargetReduction(B, TTI, Src, RK); } +Value *llvm::createTargetCompact(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val) { + Intrinsic::ID IID = TTI->getTargetSupportedCompact(); + switch (IID) { + default: + return nullptr; + case Intrinsic::aarch64_sve_compact: + Function *CompactMaskDecl = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sve_compact, Val->getType()); + return B.CreateCall(CompactMaskDecl, {Mask, Val}); + } +} + +Value *llvm::createTargetCNTP(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val) { + Intrinsic::ID IID = TTI->getTargetSupportedCNTP(); + switch (IID) { + default: + return nullptr; + case Intrinsic::aarch64_sve_cntp: + Function *CNTPDecl = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sve_cntp, Val->getType()); + return B.CreateCall(CNTPDecl, {Mask, Val}); + } +} + Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" @@ -78,6 +79,11 @@ "Scalable vectorization is available and favored when the " "cost is inconclusive."))); +static cl::opt + EnableCompactVectorization("enable-compact-vectorization", cl::init(true), + cl::Hidden, + cl::desc("Enable vectorizing 
compact pattern.")); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -785,6 +791,143 @@ return Scalarize; } +static bool isUserOfCompactPHI(BasicBlock *BB, PHINode *Phi, Instruction *I) { + if (I->getParent() != BB) + return false; + + // Operations on PHI should be affine. + if (I->getOpcode() != Instruction::Add && + I->getOpcode() != Instruction::Sub && + I->getOpcode() != Instruction::SExt && + I->getOpcode() != Instruction::ZExt) + return false; + + if (I == Phi) + return true; + + for (unsigned i = 0; i < I->getNumOperands(); i++) { + if (auto *Instr = dyn_cast(I->getOperand(i))) + if (isUserOfCompactPHI(BB, Phi, Instr)) + return true; + } + return false; +} + +// Match the basic compact pattern: +// for.body: +// %src.phi = phi i64 [ 0, %preheader ], [ %target.phi, %for.inc ] +// ... +// if.then: +// ... +// %data = load i32, ptr %In +// (there may be additional sext/zext if %src.phi types i32) +// %addr = getelementptr i32, ptr %Out, i64 %src.phi +// store i32 %data, ptr %addr +// %inc = add i64 %src.phi, 1 +// for.inc +// %target.phi = phi i64 [ %inc, if.then ], [ %src.phi, %for.body ] +bool LoopVectorizationLegality::isMatchCompact(PHINode *Phi, Loop *TheLoop, + CompactDescriptor &CpDesc) { + if (Phi->getNumIncomingValues() > 2) + return false; + + // Don't support phis who is used as mask. + for (User *U : Phi->users()) { + if (isa(U)) + return false; + } + + SmallPtrSet CompactChain; + CompactChain.insert(Phi); + + BasicBlock *LoopPreHeader = TheLoop->getLoopPreheader(); + int ExitIndex = Phi->getIncomingBlock(0) == LoopPreHeader ? 1 : 0; + BasicBlock *ExitBlock = Phi->getIncomingBlock(ExitIndex); + PHINode *CompactLiveOut = nullptr; + Value *IncValue = nullptr; + BasicBlock *IncBlock = nullptr; + bool IsCycle = false; + for (auto &CandPhi : ExitBlock->phis()) { + if (llvm::is_contained(CandPhi.incoming_values(), Phi) && + CandPhi.getNumIncomingValues() == 2) { + IsCycle = true; + CompactLiveOut = &CandPhi; + int IncIndex = CandPhi.getIncomingBlock(0) == Phi->getParent() ? 1 : 0; + IncBlock = CandPhi.getIncomingBlock(IncIndex); + IncValue = CandPhi.getIncomingValueForBlock(IncBlock); + break; + } + } + // Similar with reduction PHI. + if (!IsCycle) + return false; + CompactChain.insert(CompactLiveOut); + + // Match the pattern %inc = add i32 %src.phi, 1. + Value *Index = nullptr, *Step = nullptr; + if (!match(IncValue, m_Add(m_Value(Index), m_Value(Step)))) + return false; + if (Index != Phi) { + std::swap(Index, Step); + } + if (Step != ConstantInt::get(Step->getType(), 1)) + return false; + CompactChain.insert(IncValue); + + const DataLayout &DL = Phi->getModule()->getDataLayout(); + int CntCandStores = 0; + GetElementPtrInst *GEP = nullptr; + for (auto &Inst : *IncBlock) { + if (auto *SI = dyn_cast(&Inst)) { + // TODO: Support llvm.sve.compact.nxv8i16, llvm.sve.compact.nxv16i18 in + // the future. + unsigned TySize = DL.getTypeSizeInBits(SI->getValueOperand()->getType()); + if (TySize < 32) + return false; + + GEP = dyn_cast(SI->getPointerOperand()); + if (GEP == nullptr) + continue; + + // Only handle single pointer. + if (GEP->getNumOperands() != 2) + continue; + + // Get the index of GEP, index could be phi or sext/zext (if phi types + // i32). 
+ Value *Op1 = GEP->getOperand(1); + Value *X = nullptr; + SmallSet CandiInstrs; + if (match(Op1, m_SExt(m_Value(X))) || match(Op1, m_ZExt(m_Value(X)))) { + Op1 = X; + } + Instruction *Op1Instr = dyn_cast(Op1); + if (!Op1Instr || isUserOfCompactPHI(IncBlock, Phi, Op1Instr)) + continue; + CompactChain.insert(GEP); + CompactChain.insert(SI); + CntCandStores++; + } + } + if (!CntCandStores) + return false; + + KnownBits Bits = computeKnownBits(Phi, DL); + bool IsSign = !Bits.isNonNegative(); + CompactDescriptor CompactDesc(CompactChain, CompactLiveOut, IsSign); + CpDesc = CompactDesc; + LLVM_DEBUG(dbgs() << "LV: Found a compact chain.\n"); + return true; +} + +PHINode *LoopVectorizationLegality::getCompactChainStart(Instruction *I) const { + for (auto &CpDesc : CpList) { + if (CpDesc.second.isInCompactChain(I)) + return CpDesc.first; + } + return nullptr; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *Header = TheLoop->getHeader(); @@ -881,6 +1024,14 @@ continue; } + CompactDescriptor CpDesc; + if (EnableCompactVectorization && + TTI->isTargetSupportedCompactStore() && + isMatchCompact(Phi, TheLoop, CpDesc)) { + CpList[Phi] = CpDesc; + continue; + } + reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", @@ -1525,16 +1676,22 @@ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); SmallPtrSet ReductionLiveOuts; + SmallPtrSet CompactLiveOuts; for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + for (const auto &Compact : getCompactList()) + CompactLiveOuts.insert(Compact.second.getLiveOutPhi()); + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { // Check that all users of allowed exit values are inside the loop or // are the live-out of a reduction. if (ReductionLiveOuts.count(AE)) continue; + if (CompactLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast(U); if (TheLoop->contains(UI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -619,6 +619,8 @@ /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); + void fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, VPTransformState &State); + /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -1913,7 +1915,8 @@ /// there is no vector code generation, the check blocks are removed /// completely. void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC, + LoopVectorizationLegality *LVL = nullptr) { // Hard cutoff to limit compile-time increase in case a very large number of // runtime checks needs to be generated. 
@@ -1946,7 +1949,7 @@ "vector.memcheck"); auto DiffChecks = RtPtrChecking.getDiffChecks(); - if (DiffChecks) { + if (DiffChecks && !(LVL && LVL->hasCompactChain())) { Value *RuntimeVF = nullptr; MemRuntimeCheckCond = addDiffRuntimeChecks( MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, @@ -3604,6 +3607,47 @@ fixReduction(ReductionPhi, State); else if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); + else if (auto *CompactR = dyn_cast(&R)) + fixCompactPHI(CompactR, State); + } +} + +void InnerLoopVectorizer::fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, + VPTransformState &State) { + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstNonPHI()); + VPValue *VPBackEdgeValue = CompactPHIR->getBackedgeValue(); + Value *BackEdgeValue = State.get(VPBackEdgeValue, State.UF - 1); + Value *StartValue = CompactPHIR->getStartValue()->getUnderlyingValue(); + Value *TruncBackEdgeValue = BackEdgeValue; + if (StartValue->getType() != BackEdgeValue->getType()) + TruncBackEdgeValue = + Builder.CreateTruncOrBitCast(BackEdgeValue, StartValue->getType()); + + // Generate phi in scalar preheader to pass LiveIns outside the loop. + PHINode *ScalarPreheaderPN = + PHINode::Create(StartValue->getType(), 2, "compact.rdx", + LoopScalarPreHeader->getFirstNonPHI()); + + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + ScalarPreheaderPN->addIncoming(TruncBackEdgeValue, Incoming); + else + ScalarPreheaderPN->addIncoming(StartValue, Incoming); + } + + Value *ScalarBackEdgeValue = + CompactPHIR->getBackedgeValue()->getUnderlyingValue(); + for (PHINode &Phi : LoopScalarBody->phis()) { + if (llvm::is_contained(Phi.incoming_values(), ScalarBackEdgeValue)) { + Phi.setIncomingValueForBlock(LoopScalarPreHeader, ScalarPreheaderPN); + } + } + + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (llvm::is_contained(LCSSAPhi.incoming_values(), ScalarBackEdgeValue)) { + LCSSAPhi.addIncoming(TruncBackEdgeValue, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } } @@ -4223,6 +4267,8 @@ return !VFDatabase::hasMaskedVariant(*(cast(I)), VF); case Instruction::Load: case Instruction::Store: { + if (Legal->getCompactChainStart(I) != nullptr) + return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); Type *VTy = Ty; @@ -4584,6 +4630,12 @@ continue; } + // GEPs in compact chain should be uniform after vectorization. + if (isa(&I) && Legal->getCompactChainStart(&I)) { + addToWorklistIfAllowed(&I); + continue; + } + // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) @@ -6801,6 +6853,24 @@ continue; } + if (isa(I) && Legal->hasCompactChain()) { + InstructionCost Cost = 0; + if (!VF.isScalable() || VF.isScalar()) { + setWideningDecision(&I, VF, CM_Widen, InstructionCost::getInvalid()); + continue; + } + Type *EleTy = getLoadStoreType(&I); + VectorType *VectorTy = cast(ToVectorTy(EleTy, VF)); + const Align Alignment = getLoadStoreAlignment(&I); + unsigned AS = getLoadStoreAddressSpace(&I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Cost += TTI.getMaskedMemoryOpCost(I.getOpcode(), VectorTy, Alignment, + AS, CostKind); + Cost += TTI.getCompactCost(); + setWideningDecision(&I, VF, CM_Widen, Cost); + continue; + } + // Choose between Interleaving, Gather/Scatter or Scalarization. 
InstructionCost InterleaveCost = InstructionCost::getInvalid(); unsigned NumAccesses = 1; @@ -8451,6 +8521,30 @@ return toVPRecipeResult(Recipe); } +VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenCompactRecipe( + Instruction *Instr, ArrayRef Operands, VPlanPtr &Plan, + bool IsSign, const TargetTransformInfo *TTI) { + if (auto Phi = dyn_cast(Instr)) { + if (Instr->getParent() != OrigLoop->getHeader()) + return toVPRecipeResult(new VPWidenCompactInstructionRecipe( + Instr, Instr->getOpcode(), Operands)); + + VPValue *StartV = Operands[0]; + VPHeaderPHIRecipe *PhiRecipe = new VPCompactPHIRecipe(Phi, StartV, IsSign); + recordRecipeOf(cast( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + PhisToFix.push_back(PhiRecipe); + return toVPRecipeResult(PhiRecipe); + } + + if (isa(Instr)) + return nullptr; + + VPValue *Mask = createBlockInMask(Instr->getParent(), *Plan); + return toVPRecipeResult(new VPWidenCompactInstructionRecipe( + Instr, Instr->getOpcode(), Operands, Mask, TTI)); +} + VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8542,6 +8636,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + // Don't build vplan of fixed width version if there is a compact chain in the + // loop. + if (Legal->hasCompactChain() && !MinVF.isScalable()) + return; auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8783,8 +8881,15 @@ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan); + VPRecipeOrVPValueTy RecipeOrValue; + if (PHINode *ChainStart = Legal->getCompactChainStart(Instr)) { + RecipeOrValue = RecipeBuilder.tryToCreateWidenCompactRecipe( + Instr, Operands, Plan, Legal->isSign(ChainStart), &TTI); + } else { + RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, + Range, VPBB, Plan); + } + if (!RecipeOrValue) RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); // If Instr can be simplified to an existing VPValue, use it. @@ -9924,7 +10029,8 @@ // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC, + &LVL); // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -123,6 +123,11 @@ VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan); + VPRecipeOrVPValueTy + tryToCreateWidenCompactRecipe(Instruction *Instr, + ArrayRef Operands, VPlanPtr &Plan, + bool IsSign, const TargetTransformInfo *TTI); + /// Set the recipe created for given ingredient. This operation is a no-op for /// ingredients that were not marked using a nullptr entry in the map. 
void setRecipe(Instruction *I, VPRecipeBase *R) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1246,6 +1246,40 @@ } }; +class VPWidenCompactInstructionRecipe : public VPRecipeBase, public VPValue { +private: + Instruction &Ingredient; + unsigned Opcode; + VPValue *Mask; + const TargetTransformInfo *TTI; + + void genCompactInc(VPTransformState &State); + void genCompactStore(VPTransformState &State); + void genCompactLiveOut(VPTransformState &State); + +public: + VPWidenCompactInstructionRecipe(Instruction *I, unsigned Opcode, + ArrayRef Operands, + VPValue *Mask = nullptr, + const TargetTransformInfo *TTI = nullptr) + : VPRecipeBase(VPDef::VPCompactInstructionSC, Operands), VPValue(this, I), + Ingredient(*I), Opcode(Opcode), Mask(Mask), TTI(TTI) {} + ~VPWidenCompactInstructionRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPCompactInstructionSC) + + unsigned getOpcode() const { return Opcode; } + + VPValue *getMask() { return Mask; } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { bool isPointerLoopInvariant() const { @@ -1588,6 +1622,34 @@ bool isInLoop() const { return IsInLoop; } }; +class VPCompactPHIRecipe : public VPHeaderPHIRecipe { + PHINode *CompactPHI; + bool IsCompactSign; + +public: + VPCompactPHIRecipe(PHINode *Phi, VPValue *Start, bool IsSign) + : VPHeaderPHIRecipe(VPDef::VPCompactPHISC, Phi, Start), CompactPHI(Phi), + IsCompactSign(IsSign) {} + + ~VPCompactPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPCompactPHISC) + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPDef::VPCompactPHISC; + } + + bool isSign() { return IsCompactSign; } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPRecipeBase, public VPValue { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -830,6 +830,7 @@ // generated. bool SinglePartNeeded = isa(PhiR) || isa(PhiR) || + isa(PhiR) || (isa(PhiR) && cast(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; @@ -840,6 +841,22 @@ SinglePartNeeded ? State->UF - 1 : Part); cast(Phi)->addIncoming(Val, VectorLatchBB); } + + // Fix Compact phis if UF > 1. + if (isa(PhiR)) { + for (unsigned Part = 1; Part < State->UF; ++Part) { + Value *Val = State->get(PhiR->getBackedgeValue(), Part - 1); + // BOSCC vectorization will transform liveouts into phis, and we should + // get the underlying value here. + if (auto *PN = dyn_cast(Val)) { + int ValIdx = isa(PN->getOperand(0)) ? 1 : 0; + Val = PN->getOperand(ValIdx); + } + PHINode *Phi = cast(State->get(PhiR, Part)); + Phi->replaceAllUsesWith(Val); + Phi->eraseFromParent(); + } + } } // We do not attempt to preserve DT for outer loop vectorization currently. 
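A rough sketch (not part of the patch) of the per-part code the two new recipes are expected to emit, assuming VF = vscale x 4; %mask, %data, %idx, %out.gep, %alltrue and %ones are hypothetical names for the block-in mask, the masked-loaded values, the running compact index, the store address built from the pre-update index, an all-true predicate and a splat of i32 1. It mirrors VPWidenCompactInstructionRecipe::genCompactInc/genCompactStore below and the CHECK lines in compact.ll:

    %cnt       = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %alltrue, <vscale x 4 x i1> %mask)
    %cnt.trunc = trunc i64 %cnt to i32
    %idx.next  = add i32 %idx, %cnt.trunc
    %cmask     = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %ones)
    %store.pg  = icmp eq <vscale x 4 x i32> %cmask, %ones
    %cdata     = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %data)
    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %cdata, ptr %out.gep, i32 4, <vscale x 4 x i1> %store.pg)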
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include
@@ -933,6 +934,140 @@
   VecInd->addIncoming(LastInduction, VectorPH);
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCompactInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                            VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT ";
+  if (getOpcode() != Instruction::Store) {
+    printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+  O << Instruction::getOpcodeName(getOpcode()) << " ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenCompactInstructionRecipe::execute(VPTransformState &State) {
+  switch (getOpcode()) {
+  case Instruction::Add:
+    genCompactInc(State);
+    break;
+  case Instruction::PHI:
+    genCompactLiveOut(State);
+    break;
+  case Instruction::Store:
+    genCompactStore(State);
+    break;
+  default:
+    llvm_unreachable("Unsupported opcode for compact.");
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactStore(VPTransformState &State) {
+  assert(State.VF.isScalable() && "Compact store requires scalable vectors");
+  auto &Builder = State.Builder;
+  VPValue *VPStoredValue = getOperand(0);
+  VPValue *VPAddr = getOperand(1);
+  StoreInst *SI = cast<StoreInst>(&Ingredient);
+  Type *ScalarTy = getLoadStoreType(&Ingredient);
+  Module *M = SI->getModule();
+  VectorType *MaskVTy = cast<VectorType>(State.get(getMask(), 0)->getType());
+  Constant *One = nullptr;
+  unsigned VL = MaskVTy->getElementCount().getKnownMinValue();
+  switch (VL) {
+  case 2:
+    One = ConstantInt::get(Type::getInt64Ty(M->getContext()), 1);
+    break;
+  case 4:
+    One = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
+    break;
+  default:
+    // TODO: Try to support compact.nxv8i16 / compact.nxv16i8 in the future.
+    llvm_unreachable("Unsupported type");
+  }
+  Constant *VOne = ConstantVector::getSplat(MaskVTy->getElementCount(), One);
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    // Generate the compact mask.
+    Value *Mask = State.get(getMask(), Part);
+    Value *CompactMaskII = createTargetCompact(Builder, M, TTI, Mask, VOne);
+    assert(CompactMaskII && "Compact is not supported by the current target.");
+    Value *CompactCmpII =
+        Builder.CreateCmp(ICmpInst::ICMP_EQ, CompactMaskII, VOne);
+
+    // Transform the stored value into compact form.
+    VectorType *StoreVTy = VectorType::get(ScalarTy, State.VF);
+    const Align Alignment = getLoadStoreAlignment(&Ingredient);
+    Value *Addr = State.get(VPAddr, VPIteration(Part, 0));
+    Value *StoredValue = State.get(VPStoredValue, Part);
+    Value *SCompact = createTargetCompact(Builder, M, TTI, Mask, StoredValue);
+    assert(SCompact && "Compact is not supported by the current target.");
+    Instruction *CompactSI =
+        Builder.CreateMaskedStore(SCompact, Addr, Alignment, CompactCmpII);
+    State.addMetadata(CompactSI, SI);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactInc(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  Module *M = getUnderlyingInstr()->getModule();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *Mask = State.get(getMask(), Part);
+    Constant *PTrue = ConstantInt::getTrue(cast<VectorType>(Mask->getType()));
+    Value *CNTPCall = createTargetCNTP(Builder, M, TTI, PTrue, Mask);
+    Value *Idx = nullptr;
+    if (Part == 0)
+      Idx = State.get(getOperand(0), Part);
+    else
+      Idx = State.get(this, Part - 1);
+    Value *TruncCall = CNTPCall;
+    if (Idx->getType() != CNTPCall->getType()) {
+      TruncCall = Builder.CreateTrunc(CNTPCall, Idx->getType());
+    }
+    Value *NewInc = Builder.CreateAdd(cast(Idx), TruncCall);
+    State.set(this, NewInc, Part);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactLiveOut(
+    VPTransformState &State) {
+  // Get the exit value of the phi.
+  VPValue *VPExitValue = nullptr;
+  PHINode *Phi = cast<PHINode>(&Ingredient);
+  for (unsigned Idx = 0; Idx < Phi->getNumIncomingValues(); Idx++) {
+    PHINode *PhiOp =
+        dyn_cast_or_null<PHINode>(getOperand(Idx)->getUnderlyingValue());
+    if (!PhiOp) {
+      VPExitValue = getOperand(Idx);
+      break;
+    }
+  }
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *ExitVal = State.get(VPExitValue, Part);
+    State.set(this, ExitVal, Part);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCompactPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                               VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT-PHI ";
+  printAsOperand(O, SlotTracker);
+  O << " = phi ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCompactPHIRecipe::execute(VPTransformState &State) {
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  BasicBlock *VectorHeader = State.CFG.PrevBB;
+  VPValue *StartVPV = getStartValue();
+  Value *Start = StartVPV->getLiveInIRValue();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    PHINode *Entry = PHINode::Create(Start->getType(), 2, "compact.iv",
+                                     &*VectorHeader->getFirstInsertionPt());
+    Entry->addIncoming(Start, VectorPH);
+    State.set(this, Entry, Part);
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -348,6 +348,7 @@
     VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenMemoryInstructionSC,
+    VPCompactInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
     // START: Phi-like recipes. Need to be kept together.
@@ -361,6 +362,7 @@ VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, + VPCompactPHISC, VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll @@ -0,0 +1,78 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug -disable-output %s 2>&1 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +; CHECK-LABEL: 'kernel_reference' +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = vector-trip-count +; CHECK-NEXT: vp<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ph: +; CHECK-NEXT: EMIT vp<%1> = EXPAND SCEV (zext i32 %N to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: COMPACT-PHI ir<%n.013> = phi ir<0>, ir<%n.1> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%comp>, vp<%4> +; CHECK-NEXT: WIDEN ir<%0> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%0>, ir<%a> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%B>, vp<%4> +; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx3>, ir<%cmp1> +; CHECK-NEXT: COMPACT ir<%inc> = add ir<%n.013>, ir<1> +; CHECK-NEXT: CLONE ir<%idxprom4> = sext ir<%n.013> +; CHECK-NEXT: CLONE ir<%arrayidx5> = getelementptr inbounds ir<%Out_ref>, ir<%idxprom4> +; CHECK-NEXT: COMPACT store ir<%1>, ir<%arrayidx5> +; CHECK-NEXT: COMPACT ir<%n.1> = phi ir<%inc>, ir<%n.013> +; CHECK-NEXT: EMIT vp<%15> = VF * UF + nuw vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%15>, vp<%0> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; +; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) +define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 { +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %a + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx3, align 4 + %inc = add nsw i32 %n.013, 1 + %idxprom4 = sext i32 %n.013 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4 + store i32 %1, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ] + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ] + ret i32 %n.0.lcssa +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll @@ -4,6 +4,12 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" +; for (i = 0; i < N; i++){ +; x = comp[i]; +; if(x < a) Out[n++] = B[i]; +; } +; return n; + ; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 { ; CHECK-LABEL: @kernel_reference( @@ -12,29 +18,97 @@ ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[COMPACT_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp slt [[WIDE_LOAD2]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, [[TMP16]], poison) +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, [[TMP17]], poison) +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP16]]) +; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[COMPACT_IV]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]]) +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: [[TMP29]] = add i32 [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[COMPACT_IV]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = sext i32 [[TMP26]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP16]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq [[TMP34]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP16]], [[WIDE_MASKED_LOAD]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP36]], ptr [[TMP32]], i32 4, [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP17]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq [[TMP37]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP39:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP17]], [[WIDE_MASKED_LOAD3]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP39]], ptr [[TMP33]], i32 4, [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[COMPACT_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ 0, 
[[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ [[COMPACT_RDX]], [[SCALAR_PH]] ], [ [[N_1:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP43]], [[A]] ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[N_013]], 1 ; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]] -; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[IDXPROM4]] +; CHECK-NEXT: store i32 [[TMP44]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ] +; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]