diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -39,6 +39,39 @@ // T *p = array[-1]; // for (int i = 0; i < n; ++i) // *++p = c; +// +// 3: common multiple chains for the load/stores with same offsets in the loop, +// so that we can reuse the offsets and reduce the register pressure in the +// loop. This transformation can also increase the loop ILP as now each chain +// uses its own loop induction add/addi. But this will increase the number of +// add/addi in the loop. +// +// Generically, this means transforming loops like this: +// +// char *p; +// A1 = p + base1 +// A2 = p + base1 + offset +// B1 = p + base2 +// B2 = p + base2 + offset +// +// for (int i = 0; i < n; i++) +// unsigned long x1 = *(unsigned long *)(A1 + i); +// unsigned long x2 = *(unsigned long *)(A2 + i) +// unsigned long x3 = *(unsigned long *)(B1 + i); +// unsigned long x4 = *(unsigned long *)(B2 + i); +// } +// +// to look like this: +// +// A1’ = p + base1 // chain 1 +// B1’ = p + base2 // chain 2, now inside the loop, offset is reused. +// +// for (long long i = 0; i < n; i+=count) { +// unsigned long x1 = *(unsigned long *)(A1’ + i); +// unsigned long x2 = *(unsigned long *)(A1’ + offset + i); +// unsigned long x3 = *(unsigned long *)(B1’ + i); +// unsigned long x4 = *(unsigned long *)(B1’ + offset + i); +// } //===----------------------------------------------------------------------===// #include "PPC.h" @@ -90,6 +123,10 @@ cl::init(true), cl::Hidden, cl::desc("prefer update form when ds form is also a update form")); +static cl::opt EnableChainCommoning( + "ppc-formprep-chain-commoning", cl::init(true), cl::Hidden, + cl::desc("Enable chain commoning in PPC loop prepare pass.")); + // Sum of following 3 per loop thresholds for all loops can not be larger // than MaxVarsPrep. // now the thresholds for each kind prep are exterimental values on Power9. @@ -125,19 +162,32 @@ namespace { struct BucketElement { - BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(const SCEV *O, Instruction *I) : Offset(O), Instr(I) {} BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - const SCEVConstant *Offset; + const SCEV *Offset; Instruction *Instr; }; struct Bucket { - Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), - Elements(1, BucketElement(I)) {} + Bucket(const SCEV *B, Instruction *I) + : BaseSCEV(B), Elements(1, BucketElement(I)) { + ChainSize = 0; + } + // The base of the whole bucket. const SCEV *BaseSCEV; + + // All elements in the bucket. In the bucket, the element with the BaseSCEV + // has no offset and all other elements are stored as offsets to the + // BaseSCEV. SmallVector Elements; + + // The potential chains size. This is used for chain commoning only. + unsigned ChainSize; + + // The base for each potential chain. This is used for chain commoning only. + SmallVector ChainBases; }; // "UpdateForm" is not a real PPC instruction form, it stands for dform @@ -192,6 +242,24 @@ Value *getPreparedIncNode(Loop *L, Instruction *MemI, const SCEV *BasePtrIncSCEV); + /// Collect chain commoning load/store candidates in Loop \p L. + SmallVector collectCandidatesForChainCommoning(Loop *L); + + /// Add a candidate to candidates \p Buckets for chain commoning. + void addOneCandidateForChainCommoning(Instruction *MemI, const SCEV *LSCEV, + SmallVector &Buckets); + + /// Common chains to reuse offsets for a loop to reduce register pressure. + bool chainCommoning(Loop *L, SmallVector &Buckets); + + /// Find out the potential commoning chains and their bases. + bool prepareBasesForCommoningChains(Bucket &BucketChain); + + /// Rewrite load/store according to the common chains. + bool + rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket, + SmallSet &BBChanged); + /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. SmallVector collectCandidates( @@ -234,6 +302,19 @@ bool rewriteLoadStores(Loop *L, Bucket &BucketChain, SmallSet &BBChanged, InstrForm Form); + + /// Rewrite for the base load/store of a chain. + std::pair + rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV, + Instruction *BaseMemI, bool CanPreInc, InstrForm Form, + SCEVExpander &SCEVE, SmallPtrSet &DeletedPtrs); + + /// Rewrite for the other load/stores of a chain according to the new \p + /// Base. + Instruction * + rewriteForBucketElement(std::pair Base, + const BucketElement &Element, Value *OffToBase, + SmallPtrSet &DeletedPtrs); }; } // end anonymous namespace @@ -321,6 +402,485 @@ return MadeChange; } +// check if the SCEV is only with one ptr operand in its start, so that we can +// use that start as a chain separator. +static bool isValidChainCommoningCandidate(const SCEV *LSCEV) { + const SCEVAddRecExpr *ARSCEV = cast(LSCEV); + if (!ARSCEV) + return false; + + if (!ARSCEV->isAffine()) + return false; + + const SCEV *Start = ARSCEV->getStart(); + + // A single pointer. We can treat it with offset 0. + if (isa(Start) && Start->getType()->isPointerTy()) + return true; + + const SCEVAddExpr *ASCEV = dyn_cast(Start); + + // We need a SCEVAddExpr to include both base and offset. + if (!ASCEV) + return false; + + // Make sure there is only one pointer operand(base) and all other operands + // are integer type. + bool SawPointer = false; + for (const SCEV *Op : ASCEV->operands()) { + if (Op->getType()->isPointerTy()) { + if (SawPointer) + return false; + SawPointer = true; + } else if (!Op->getType()->isIntegerTy()) + return false; + } + + return SawPointer; +} + +// Make sure the diff between the base and new candidate is required type. +static bool isValidChainCommoningDiff(const SCEV *LSCEV) { + assert(LSCEV && "Invalid SCEV for Ptr value."); + + // Don't mess up previous dform prepare. + if (isa(LSCEV)) + return false; + + // A single integer type offset. + if (isa(LSCEV) && LSCEV->getType()->isIntegerTy()) + return true; + + const SCEVNAryExpr *ASCEV = dyn_cast(LSCEV); + if (!ASCEV) + return false; + + for (const SCEV *Op : ASCEV->operands()) + if (!Op->getType()->isIntegerTy()) + return false; + + return true; +} + +void PPCLoopInstrFormPrep::addOneCandidateForChainCommoning( + Instruction *MemI, const SCEV *LSCEV, SmallVector &Buckets) { + assert((MemI && getPointerOperandAndType(MemI)) && + "Candidate should be a memory instruction."); + assert(LSCEV && "Invalid SCEV for Ptr value."); + + if (!isValidChainCommoningCandidate(LSCEV)) + return; + + bool FoundBucket = false; + for (auto &B : Buckets) { + if (cast(B.BaseSCEV)->getStepRecurrence(*SE) != + cast(LSCEV)->getStepRecurrence(*SE)) + continue; + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (isValidChainCommoningDiff(Diff)) { + B.Elements.push_back(BucketElement(Diff, MemI)); + FoundBucket = true; + break; + } + } + + if (!FoundBucket) + Buckets.push_back(Bucket(LSCEV, MemI)); +} + +SmallVector +PPCLoopInstrFormPrep::collectCandidatesForChainCommoning(Loop *L) { + SmallVector Buckets; + + if (!EnableChainCommoning) + return Buckets; + + for (const auto &BB : L->blocks()) + for (auto &J : *BB) { + Value *PtrValue = getPointerOperandAndType(&J); + if (!PtrValue) + continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); + const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + continue; + + addOneCandidateForChainCommoning(&J, LSCEV, Buckets); + } + return Buckets; +} + +bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) { + // It is profitable to have at least 4 elements in the chain bucket. + assert(CBucket.Elements.size() >= 4 && "Not a candidate bucket!\n"); + + const SCEV *Offset = CBucket.Elements[1].Offset; + unsigned TotalCount = 1; + unsigned FirstGroupCount = 1; + unsigned EleNum = CBucket.Elements.size(); + bool SawSeparater = false; + for (unsigned j = 2; j != EleNum; ++j) { + if (SE->getMinusSCEV(CBucket.Elements[j].Offset, + CBucket.Elements[j - 1].Offset) == Offset) { + if (!SawSeparater) + FirstGroupCount++; + TotalCount++; + } else + SawSeparater = true; + } + + // No reuseable offset, skip this bucket. + if (TotalCount == 1) + return false; + + // All elements are increased by Offset. + // The number of new bases should both be sqrt(EleNum). + if (!SawSeparater) { + unsigned ChainNum = (unsigned)sqrt(EleNum); + CBucket.ChainSize = (unsigned)(EleNum / ChainNum); + + // If this is not a perfect chain(eg: all elements can be put inside + // commoning chains.), skip now. + if (CBucket.ChainSize * ChainNum != EleNum) + return false; + + for (unsigned i = 0; i < ChainNum; i++) + CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]); + return true; + } + + unsigned ChainNum = TotalCount / FirstGroupCount; + CBucket.ChainSize = EleNum / ChainNum; + + // All Elements can be put inside commoning chains. + if (CBucket.ChainSize * ChainNum != EleNum) + return false; + + // Check each chain has same offsets for all elements. + for (unsigned i = 1; i < CBucket.ChainSize; i++) + for (unsigned j = 1; j < ChainNum; j++) + if (CBucket.Elements[i].Offset != + SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset, + CBucket.Elements[j * CBucket.ChainSize].Offset)) + return false; + + for (unsigned i = 0; i < ChainNum; i++) + CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]); + + return true; +} + +bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, + SmallVector &Buckets) { + bool MadeChange = false; + + if (Buckets.empty()) + return MadeChange; + + SmallSet BBChanged; + + for (auto &Bucket : Buckets) { + // The minimal size for profitable chain commoning: + // A1 = base + offset1 + // A2 = base + offset2 (offset2 - offset1 = X) + // A3 = base + offset3 + // A4 = base + offset4 (offset4 - offset3 = X) + // ======> + // base1 = base + offset1 + // base2 = base + offset3 + // A1 = base1 + // A2 = base1 + X + // A3 = base2 + // A4 = base2 + X + // + // There is benefit because of reuse of offest 'X'. + if (Bucket.Elements.size() < 4) + continue; + + if (prepareBasesForCommoningChains(Bucket)) + MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged); + } + + if (MadeChange) + for (auto &BB : L->blocks()) + if (BBChanged.count(BB)) + DeleteDeadPHIs(BB); + return MadeChange; +} + +bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( + Loop *L, Bucket &Bucket, SmallSet &BBChanged) { + bool MadeChange = false; + + assert(Bucket.Elements.size() == + Bucket.ChainBases.size() * Bucket.ChainSize && + "invalid bucket for chain commoning!\n"); + SmallPtrSet DeletedPtrs; + + // Make sure each offset is able to expand. + for (unsigned Idx = 1; Idx < Bucket.ChainSize; ++Idx) + if (!isSafeToExpand(Bucket.Elements[Idx].Offset, *SE)) + return MadeChange; + + // Make sure each base is able to expand. + for (unsigned Idx = 0; Idx < Bucket.ChainBases.size(); ++Idx) { + const SCEV *BaseSCEV = + Idx ? SE->getAddExpr(Bucket.BaseSCEV, Bucket.ChainBases[Idx].Offset) + : Bucket.BaseSCEV; + const SCEVAddRecExpr *BasePtrSCEV = cast(BaseSCEV); + if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE)) + return MadeChange; + } + + BasicBlock *Header = L->getHeader(); + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + + Type *I64Ty = Type::getInt64Ty(Header->getContext()); + + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + + for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) { + unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx; + const SCEV *BaseSCEV = + ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV, + Bucket.Elements[BaseElemIdx].Offset) + : Bucket.BaseSCEV; + const SCEVAddRecExpr *BasePtrSCEV = cast(BaseSCEV); + assert(BasePtrSCEV->isAffine() && + "Invalid SCEV type for the base ptr for a candidate chain!\n"); + + std::pair Base = + rewriteForBase(L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr, + false /* CanPreInc */, UpdateForm, SCEVE, DeletedPtrs); + + if (!Base.first || !Base.second) + return MadeChange; + + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. + SmallPtrSet NewPtrs; + NewPtrs.insert(Base.first); + + for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize; + ++Idx) { + BucketElement &I = Bucket.Elements[Idx]; + Value *Ptr = getPointerOperandAndType(I.Instr); + assert(Ptr && "No pointer operand"); + if (NewPtrs.count(Ptr)) + continue; + + const SCEV *OffsetSCEV = + BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset, + Bucket.Elements[BaseElemIdx].Offset) + : Bucket.Elements[Idx].Offset; + Value *OffsetValue = SCEVE.expandCodeFor( + OffsetSCEV, I64Ty, LoopPredecessor->getTerminator()); + + Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx], + OffsetValue, DeletedPtrs); + + assert(NewPtr && "Wrong rewrite!\n"); + NewPtrs.insert(NewPtr); + } + } + + // Clear the rewriter cache, because values that are in the rewriter's cache + // can be deleted below, causing the AssertingVH in the cache to trigger. + SCEVE.clear(); + + for (auto *Ptr : DeletedPtrs) { + if (Instruction *IDel = dyn_cast(Ptr)) + BBChanged.insert(IDel->getParent()); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + } + + MadeChange = true; + return MadeChange; +} + +std::pair +PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV, + Instruction *BaseMemI, bool CanPreInc, + InstrForm Form, SCEVExpander &SCEVE, + SmallPtrSet &DeletedPtrs) { + + LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); + + assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); + + Value *BasePtr = getPointerOperandAndType(BaseMemI); + assert(BasePtr && "No pointer operand"); + + Type *I8Ty = Type::getInt8Ty(BaseMemI->getParent()->getContext()); + Type *I8PtrTy = + Type::getInt8PtrTy(BaseMemI->getParent()->getContext(), + BasePtr->getType()->getPointerAddressSpace()); + + bool IsConstantInc = false; + const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE); + Value *IncNode = getPreparedIncNode(L, BaseMemI, BasePtrIncSCEV); + + const SCEVConstant *BasePtrIncConstantSCEV = + dyn_cast(BasePtrIncSCEV); + if (BasePtrIncConstantSCEV) + IsConstantInc = true; + + // No valid representation for the increment. + if (!IncNode) { + LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n"); + return std::make_pair(nullptr, nullptr); + } + + const SCEV *BasePtrStartSCEV = nullptr; + if (CanPreInc) { + assert(SE->isLoopInvariant(BasePtrIncSCEV, L) && + "Increment is not loop invariant!\n"); + BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(), + IsConstantInc ? BasePtrIncConstantSCEV + : BasePtrIncSCEV); + } else + BasePtrStartSCEV = BasePtrSCEV->getStart(); + + if (alreadyPrepared(L, BaseMemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) { + LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n"); + return std::make_pair(nullptr, nullptr); + } + + LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); + + BasicBlock *Header = L->getHeader(); + unsigned HeaderLoopPredCount = pred_size(Header); + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + + PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, + getInstrName(BaseMemI, PHINodeNameSuffix), + Header->getFirstNonPHI()); + + Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, + LoopPredecessor->getTerminator()); + + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to add it the right number of times. + for (auto PI : predecessors(Header)) { + if (PI != LoopPredecessor) + continue; + + NewPHI->addIncoming(BasePtrStart, LoopPredecessor); + } + + Instruction *PtrInc = nullptr; + Instruction *NewBasePtr = nullptr; + if (CanPreInc) { + Instruction *InsPoint = &*Header->getFirstInsertionPt(); + PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix), + InsPoint); + cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); + for (auto PI : predecessors(Header)) { + if (PI == LoopPredecessor) + continue; + + NewPHI->addIncoming(PtrInc, PI); + } + if (PtrInc->getType() != BasePtr->getType()) + NewBasePtr = + new BitCastInst(PtrInc, BasePtr->getType(), + getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); + else + NewBasePtr = PtrInc; + } else { + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to make sure no more incoming value for them in PHI. + for (auto PI : predecessors(Header)) { + if (PI == LoopPredecessor) + continue; + + // For the latch predecessor, we need to insert a GEP just before the + // terminator to increase the address. + BasicBlock *BB = PI; + Instruction *InsPoint = BB->getTerminator(); + PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix), + InsPoint); + cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); + + NewPHI->addIncoming(PtrInc, PI); + } + PtrInc = NewPHI; + if (NewPHI->getType() != BasePtr->getType()) + NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(), + getInstrName(NewPHI, CastNodeNameSuffix), + &*Header->getFirstInsertionPt()); + else + NewBasePtr = NewPHI; + } + + BasePtr->replaceAllUsesWith(NewBasePtr); + + DeletedPtrs.insert(BasePtr); + + return std::make_pair(NewBasePtr, PtrInc); +} + +Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement( + std::pair Base, const BucketElement &Element, + Value *OffToBase, SmallPtrSet &DeletedPtrs) { + Instruction *NewBasePtr = Base.first; + Instruction *PtrInc = Base.second; + assert((NewBasePtr && PtrInc) && "base does not exist!\n"); + + Type *I8Ty = Type::getInt8Ty(PtrInc->getParent()->getContext()); + + Value *Ptr = getPointerOperandAndType(Element.Instr); + assert(Ptr && "No pointer operand"); + + Instruction *RealNewPtr; + if (!Element.Offset || + (isa(Element.Offset) && + cast(Element.Offset)->getValue()->isZero())) { + RealNewPtr = NewBasePtr; + } else { + Instruction *PtrIP = dyn_cast(Ptr); + if (PtrIP && isa(NewBasePtr) && + cast(NewBasePtr)->getParent() == PtrIP->getParent()) + PtrIP = nullptr; + else if (PtrIP && isa(PtrIP)) + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); + else if (!PtrIP) + PtrIP = Element.Instr; + + assert(OffToBase && "There should be an offset for non base element!\n"); + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, OffToBase, + getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP); + if (!PtrIP) + NewPtr->insertAfter(cast(PtrInc)); + NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); + RealNewPtr = NewPtr; + } + + Instruction *ReplNewPtr; + if (Ptr->getType() != RealNewPtr->getType()) { + ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), + getInstrName(Ptr, CastNodeNameSuffix)); + ReplNewPtr->insertAfter(RealNewPtr); + } else + ReplNewPtr = RealNewPtr; + + Ptr->replaceAllUsesWith(ReplNewPtr); + DeletedPtrs.insert(Ptr); + + return ReplNewPtr; +} + void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, SmallVector &Buckets, unsigned MaxCandidateNum) { @@ -391,7 +951,7 @@ RemainderOffsetInfo[0] = std::make_pair(0, 1); else { unsigned Remainder = - BucketChain.Elements[j].Offset->getAPInt().urem(Form); + cast(BucketChain.Elements[j].Offset)->getAPInt().urem(Form); if (RemainderOffsetInfo.find(Remainder) == RemainderOffsetInfo.end()) RemainderOffsetInfo[Remainder] = std::make_pair(j, 1); else @@ -473,7 +1033,7 @@ // If our chosen element has no offset from the base pointer, there's // nothing to do. if (!BucketChain.Elements[j].Offset || - BucketChain.Elements[j].Offset->isZero()) + cast(BucketChain.Elements[j].Offset)->isZero()) break; const SCEV *Offset = BucketChain.Elements[j].Offset; @@ -491,157 +1051,45 @@ return true; } -bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, - SmallSet &BBChanged, - InstrForm Form) { +bool PPCLoopInstrFormPrep::rewriteLoadStores( + Loop *L, Bucket &BucketChain, SmallSet &BBChanged, + InstrForm Form) { bool MadeChange = false; const SCEVAddRecExpr *BasePtrSCEV = cast(BucketChain.BaseSCEV); if (!BasePtrSCEV->isAffine()) return MadeChange; - LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); - - assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - - // The instruction corresponding to the Bucket's BaseSCEV must be the first - // in the vector of elements. - Instruction *MemI = BucketChain.Elements.begin()->Instr; - Value *BasePtr = getPointerOperandAndType(MemI); - assert(BasePtr && "No pointer operand"); - - Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); - Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), - BasePtr->getType()->getPointerAddressSpace()); - - if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L)) + if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE)) return MadeChange; - bool IsConstantInc = false; - const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE); - Value *IncNode = getPreparedIncNode(L, MemI, BasePtrIncSCEV); + SmallPtrSet DeletedPtrs; - const SCEVConstant *BasePtrIncConstantSCEV = - dyn_cast(BasePtrIncSCEV); - if (BasePtrIncConstantSCEV) - IsConstantInc = true; - - // No valid representation for the increment. - if (!IncNode) { - LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n"); - return MadeChange; - } + BasicBlock *Header = L->getHeader(); + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); // For some DS form load/store instructions, it can also be an update form, // if the stride is constant and is a multipler of 4. Use update form if // prefer it. - bool CanPreInc = - (Form == UpdateForm || - ((Form == DSForm) && IsConstantInc && - !BasePtrIncConstantSCEV->getAPInt().urem(4) && PreferUpdateForm)); - const SCEV *BasePtrStartSCEV = nullptr; - if (CanPreInc) { - assert(SE->isLoopInvariant(BasePtrIncSCEV, L) && - "Increment is not loop invariant!\n"); - BasePtrStartSCEV = SE->getMinusSCEV(BasePtrSCEV->getStart(), - IsConstantInc ? BasePtrIncConstantSCEV - : BasePtrIncSCEV); - } else - BasePtrStartSCEV = BasePtrSCEV->getStart(); - - if (!isSafeToExpand(BasePtrStartSCEV, *SE)) - return MadeChange; - - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) { - LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n"); + bool CanPreInc = (Form == UpdateForm || + ((Form == DSForm) && + isa(BasePtrSCEV->getStepRecurrence(*SE)) && + !cast(BasePtrSCEV->getStepRecurrence(*SE)) + ->getAPInt() + .urem(4) && + PreferUpdateForm)); + + std::pair Base = + rewriteForBase(L, BasePtrSCEV, BucketChain.Elements.begin()->Instr, + CanPreInc, Form, SCEVE, DeletedPtrs); + + if (!Base.first || !Base.second) return MadeChange; - } - - LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); - - BasicBlock *Header = L->getHeader(); - unsigned HeaderLoopPredCount = pred_size(Header); - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); - - PHINode *NewPHI = - PHINode::Create(I8PtrTy, HeaderLoopPredCount, - getInstrName(MemI, PHINodeNameSuffix), - Header->getFirstNonPHI()); - - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); - Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, - LoopPredecessor->getTerminator()); - - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to add it the right number of times. - for (auto PI : predecessors(Header)) { - if (PI != LoopPredecessor) - continue; - - NewPHI->addIncoming(BasePtrStart, LoopPredecessor); - } - - Instruction *PtrInc = nullptr; - Instruction *NewBasePtr = nullptr; - if (CanPreInc) { - Instruction *InsPoint = &*Header->getFirstInsertionPt(); - PtrInc = GetElementPtrInst::Create(I8Ty, NewPHI, IncNode, - getInstrName(MemI, GEPNodeIncNameSuffix), - InsPoint); - cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); - for (auto PI : predecessors(Header)) { - if (PI == LoopPredecessor) - continue; - - NewPHI->addIncoming(PtrInc, PI); - } - if (PtrInc->getType() != BasePtr->getType()) - NewBasePtr = new BitCastInst( - PtrInc, BasePtr->getType(), - getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); - else - NewBasePtr = PtrInc; - } else { - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to make sure no more incoming value for them in PHI. - for (auto PI : predecessors(Header)) { - if (PI == LoopPredecessor) - continue; - - // For the latch predecessor, we need to insert a GEP just before the - // terminator to increase the address. - BasicBlock *BB = PI; - Instruction *InsPoint = BB->getTerminator(); - PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, IncNode, getInstrName(MemI, GEPNodeIncNameSuffix), - InsPoint); - cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); - - NewPHI->addIncoming(PtrInc, PI); - } - PtrInc = NewPHI; - if (NewPHI->getType() != BasePtr->getType()) - NewBasePtr = - new BitCastInst(NewPHI, BasePtr->getType(), - getInstrName(NewPHI, CastNodeNameSuffix), - &*Header->getFirstInsertionPt()); - else - NewBasePtr = NewPHI; - } - - // Clear the rewriter cache, because values that are in the rewriter's cache - // can be deleted below, causing the AssertingVH in the cache to trigger. - SCEVE.clear(); - - if (Instruction *IDel = dyn_cast(BasePtr)) - BBChanged.insert(IDel->getParent()); - BasePtr->replaceAllUsesWith(NewBasePtr); - RecursivelyDeleteTriviallyDeadInstructions(BasePtr); // Keep track of the replacement pointer values we've inserted so that we // don't generate more pointer values than necessary. SmallPtrSet NewPtrs; - NewPtrs.insert(NewBasePtr); + NewPtrs.insert(Base.first); for (auto I = std::next(BucketChain.Elements.begin()), IE = BucketChain.Elements.end(); I != IE; ++I) { @@ -649,44 +1097,22 @@ assert(Ptr && "No pointer operand"); if (NewPtrs.count(Ptr)) continue; + Instruction *NewPtr = rewriteForBucketElement( + Base, *I, + I->Offset ? cast(I->Offset)->getValue() : nullptr, + DeletedPtrs); + assert(NewPtr && "wrong rewrite!\n"); + NewPtrs.insert(NewPtr); + } - Instruction *RealNewPtr; - if (!I->Offset || I->Offset->getValue()->isZero()) { - RealNewPtr = NewBasePtr; - } else { - Instruction *PtrIP = dyn_cast(Ptr); - if (PtrIP && isa(NewBasePtr) && - cast(NewBasePtr)->getParent() == PtrIP->getParent()) - PtrIP = nullptr; - else if (PtrIP && isa(PtrIP)) - PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); - else if (!PtrIP) - PtrIP = I->Instr; - - GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, I->Offset->getValue(), - getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP); - if (!PtrIP) - NewPtr->insertAfter(cast(PtrInc)); - NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); - RealNewPtr = NewPtr; - } + // Clear the rewriter cache, because values that are in the rewriter's cache + // can be deleted below, causing the AssertingVH in the cache to trigger. + SCEVE.clear(); + for (auto *Ptr : DeletedPtrs) { if (Instruction *IDel = dyn_cast(Ptr)) BBChanged.insert(IDel->getParent()); - - Instruction *ReplNewPtr; - if (Ptr->getType() != RealNewPtr->getType()) { - ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), - getInstrName(Ptr, CastNodeNameSuffix)); - ReplNewPtr->insertAfter(RealNewPtr); - } else - ReplNewPtr = RealNewPtr; - - Ptr->replaceAllUsesWith(ReplNewPtr); RecursivelyDeleteTriviallyDeadInstructions(Ptr); - - NewPtrs.insert(RealNewPtr); } MadeChange = true; @@ -1004,5 +1430,14 @@ if (!DQFormBuckets.empty()) MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm); + // Collect buckets of comparable addresses used by loads and stores for chain + // commoning. With chain commoning, we reuses offsets between the chains, so + // the register pressure will be reduced. + SmallVector Buckets = collectCandidatesForChainCommoning(L); + + // Prepare for chain commoning. + if (!Buckets.empty()) + MadeChange |= chainCommoning(L, Buckets); + return MadeChange; } diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll --- a/llvm/test/CodeGen/PowerPC/common-chain.ll +++ b/llvm/test/CodeGen/PowerPC/common-chain.ll @@ -38,23 +38,26 @@ ; CHECK-NEXT: cmpdi r6, 1 ; CHECK-NEXT: blt cr0, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r8, r4, 1 +; CHECK-NEXT: sldi r7, r4, 1 ; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: add r8, r4, r7 +; CHECK-NEXT: add r7, r5, r4 +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: add r7, r3, r7 ; CHECK-NEXT: add r5, r3, r5 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: sldi r7, r4, 2 -; CHECK-NEXT: add r9, r4, r8 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r6, r5, r4 -; CHECK-NEXT: ldx r10, r5, r8 -; CHECK-NEXT: ldx r11, r5, r9 -; CHECK-NEXT: ldx r12, r5, r7 +; CHECK-NEXT: ld r6, 0(r7) +; CHECK-NEXT: ldx r8, r7, r4 +; CHECK-NEXT: ld r9, 0(r5) +; CHECK-NEXT: ldx r10, r5, r4 +; CHECK-NEXT: addi r7, r7, 1 ; CHECK-NEXT: addi r5, r5, 1 -; CHECK-NEXT: mulld r6, r10, r6 -; CHECK-NEXT: mulld r6, r6, r11 -; CHECK-NEXT: maddld r3, r6, r12, r3 +; CHECK-NEXT: mulld r6, r8, r6 +; CHECK-NEXT: mulld r6, r6, r9 +; CHECK-NEXT: maddld r3, r6, r10, r3 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-NEXT: blr @@ -564,24 +567,26 @@ ; CHECK-NEXT: cmpdi r6, 1 ; CHECK-NEXT: blt cr0, .LBB5_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: mulli r7, r4, 6 -; CHECK-NEXT: add r5, r3, r5 -; CHECK-NEXT: sldi r3, r4, 1 -; CHECK-NEXT: add r9, r4, r3 -; CHECK-NEXT: mtctr r6 ; CHECK-NEXT: sldi r8, r4, 2 +; CHECK-NEXT: add r7, r5, r4 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: add r7, r3, r7 +; CHECK-NEXT: sldi r4, r4, 1 +; CHECK-NEXT: add r5, r3, r5 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r6, r5, r4 -; CHECK-NEXT: ldx r10, r5, r9 -; CHECK-NEXT: ldx r11, r5, r8 -; CHECK-NEXT: ldx r12, r5, r7 +; CHECK-NEXT: ld r6, 0(r7) +; CHECK-NEXT: ldx r8, r7, r4 +; CHECK-NEXT: ld r9, 0(r5) +; CHECK-NEXT: ldx r10, r5, r4 +; CHECK-NEXT: addi r7, r7, 1 ; CHECK-NEXT: addi r5, r5, 1 -; CHECK-NEXT: mulld r6, r10, r6 -; CHECK-NEXT: mulld r6, r6, r11 -; CHECK-NEXT: maddld r3, r6, r12, r3 +; CHECK-NEXT: mulld r6, r8, r6 +; CHECK-NEXT: mulld r6, r6, r9 +; CHECK-NEXT: maddld r3, r6, r10, r3 ; CHECK-NEXT: bdnz .LBB5_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-NEXT: blr @@ -664,32 +669,30 @@ ; CHECK-NEXT: cmpdi r7, 1 ; CHECK-NEXT: blt cr0, .LBB6_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r8, r4, 1 -; CHECK-NEXT: mtctr r7 -; CHECK-NEXT: add r9, r4, r8 -; CHECK-NEXT: add r8, r6, r9 ; CHECK-NEXT: add r6, r6, r4 -; CHECK-NEXT: add r9, r5, r9 ; CHECK-NEXT: add r5, r5, r4 -; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: sldi r4, r4, 1 +; CHECK-NEXT: add r5, r3, r5 +; CHECK-NEXT: add r6, r3, r6 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r7, r3, r5 -; CHECK-NEXT: ldx r10, r3, r9 -; CHECK-NEXT: ldx r11, r3, r6 -; CHECK-NEXT: ldx r12, r3, r8 -; CHECK-NEXT: addi r3, r3, 1 -; CHECK-NEXT: mulld r7, r10, r7 -; CHECK-NEXT: mulld r7, r7, r11 -; CHECK-NEXT: maddld r4, r7, r12, r4 +; CHECK-NEXT: ld r7, 0(r5) +; CHECK-NEXT: ldx r8, r5, r4 +; CHECK-NEXT: ld r9, 0(r6) +; CHECK-NEXT: ldx r10, r6, r4 +; CHECK-NEXT: addi r5, r5, 1 +; CHECK-NEXT: addi r6, r6, 1 +; CHECK-NEXT: mulld r7, r8, r7 +; CHECK-NEXT: mulld r7, r7, r9 +; CHECK-NEXT: maddld r3, r7, r10, r3 ; CHECK-NEXT: bdnz .LBB6_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup -; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB6_4: -; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %mul = mul nsw i64 %offset, 3 @@ -748,328 +751,272 @@ define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) { ; CHECK-LABEL: spill_reduce_succ: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stdu r1, -336(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 336 -; CHECK-NEXT: .cfi_offset r14, -144 -; CHECK-NEXT: .cfi_offset r15, -136 -; CHECK-NEXT: .cfi_offset r16, -128 -; CHECK-NEXT: .cfi_offset r17, -120 -; CHECK-NEXT: .cfi_offset r18, -112 -; CHECK-NEXT: .cfi_offset r19, -104 -; CHECK-NEXT: .cfi_offset r20, -96 -; CHECK-NEXT: .cfi_offset r21, -88 -; CHECK-NEXT: .cfi_offset r22, -80 -; CHECK-NEXT: .cfi_offset r23, -72 -; CHECK-NEXT: .cfi_offset r24, -64 -; CHECK-NEXT: .cfi_offset r25, -56 -; CHECK-NEXT: .cfi_offset r26, -48 -; CHECK-NEXT: .cfi_offset r27, -40 -; CHECK-NEXT: .cfi_offset r28, -32 -; CHECK-NEXT: .cfi_offset r29, -24 -; CHECK-NEXT: .cfi_offset r30, -16 -; CHECK-NEXT: .cfi_offset r31, -8 -; CHECK-NEXT: .cfi_offset r2, -152 ; CHECK-NEXT: cmpdi r6, 1 -; CHECK-NEXT: std r14, 192(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r15, 200(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r16, 208(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r17, 216(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r18, 224(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r19, 232(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r20, 240(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r21, 248(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r22, 256(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r23, 264(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r24, 272(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r25, 280(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r26, 288(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r27, 296(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r28, 304(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r29, 312(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r30, 320(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r31, 328(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r2, 184(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r9, 40(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r8, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r7, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r5, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r4, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r9, -176(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r8, -168(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r7, -160(r1) # 8-byte Folded Spill ; CHECK-NEXT: blt cr0, .LBB7_7 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r3, r6, 2 -; CHECK-NEXT: li r4, 1 -; CHECK-NEXT: mr r16, r10 -; CHECK-NEXT: cmpdi r3, 1 -; CHECK-NEXT: iselgt r3, r3, r4 -; CHECK-NEXT: addi r4, r3, -1 -; CHECK-NEXT: clrldi r6, r3, 63 -; CHECK-NEXT: cmpldi r4, 3 +; CHECK-NEXT: sldi r6, r6, 2 +; CHECK-NEXT: li r7, 1 +; CHECK-NEXT: mr r12, r10 +; CHECK-NEXT: cmpdi r6, 1 +; CHECK-NEXT: iselgt r7, r6, r7 +; CHECK-NEXT: addi r8, r7, -1 +; CHECK-NEXT: clrldi r6, r7, 63 +; CHECK-NEXT: cmpldi r8, 3 ; CHECK-NEXT: blt cr0, .LBB7_4 ; CHECK-NEXT: # %bb.2: # %for.body.preheader.new -; CHECK-NEXT: ld r30, 40(r1) # 8-byte Folded Reload -; CHECK-NEXT: sldi r4, r16, 2 -; CHECK-NEXT: ld r19, 80(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r21, 72(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r22, 56(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r27, 48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r18, 64(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r5, r30, r4 -; CHECK-NEXT: rldicl r0, r3, 62, 2 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r11, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 168(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r27, r4 -; CHECK-NEXT: add r4, r18, r4 -; CHECK-NEXT: std r7, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r29, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: add r26, r19, r4 -; CHECK-NEXT: std r5, 152(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r21, r4 -; CHECK-NEXT: add r4, r22, r4 -; CHECK-NEXT: std r7, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r4, 136(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r4, r16, 1 -; CHECK-NEXT: std r5, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r10, r16, r4 -; CHECK-NEXT: add r3, r18, r4 -; CHECK-NEXT: add r5, r30, r10 -; CHECK-NEXT: sldi r3, r3, 3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r23, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r27, r10 -; CHECK-NEXT: std r7, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r20, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r18, r10 -; CHECK-NEXT: std r7, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: sub r10, r18, r10 -; CHECK-NEXT: sldi r5, r5, 3 +; CHECK-NEXT: rldicl r7, r7, 62, 2 +; CHECK-NEXT: sldi r10, r12, 2 +; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r31, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill +; CHECK-NEXT: mr r7, r4 +; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r8, r4, r10 +; CHECK-NEXT: sldi r8, r8, 3 +; CHECK-NEXT: add r9, r5, r8 +; CHECK-NEXT: add r8, r2, r10 +; CHECK-NEXT: add r10, r31, r10 ; CHECK-NEXT: sldi r10, r10, 3 -; CHECK-NEXT: add r17, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r30, r4 -; CHECK-NEXT: std r7, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r7, r19, r3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r14, r19, r5 -; CHECK-NEXT: add r31, r21, r5 -; CHECK-NEXT: add r2, r22, r5 -; CHECK-NEXT: add r5, r27, r4 -; CHECK-NEXT: add r4, r22, r3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r12, r19, r5 -; CHECK-NEXT: add r8, r21, r5 -; CHECK-NEXT: add r9, r22, r5 -; CHECK-NEXT: add r5, r21, r3 -; CHECK-NEXT: add r3, r16, r30 -; CHECK-NEXT: rldicl r30, r0, 2, 1 -; CHECK-NEXT: addi r0, r30, -4 -; CHECK-NEXT: sldi r28, r3, 3 -; CHECK-NEXT: rldicl r30, r0, 62, 2 -; CHECK-NEXT: add r3, r19, r28 -; CHECK-NEXT: addi r0, r30, 1 -; CHECK-NEXT: add r30, r21, r28 -; CHECK-NEXT: add r28, r22, r28 -; CHECK-NEXT: mtctr r0 -; CHECK-NEXT: add r0, r16, r27 +; CHECK-NEXT: sldi r8, r8, 3 +; CHECK-NEXT: add r30, r5, r10 +; CHECK-NEXT: add r29, r7, r10 +; CHECK-NEXT: add r28, r3, r10 +; CHECK-NEXT: sldi r10, r12, 1 +; CHECK-NEXT: add r8, r5, r8 +; CHECK-NEXT: add r11, r12, r10 +; CHECK-NEXT: add r0, r4, r11 +; CHECK-NEXT: sldi r0, r0, 3 +; CHECK-NEXT: add r27, r5, r0 +; CHECK-NEXT: add r0, r2, r11 +; CHECK-NEXT: add r11, r31, r11 +; CHECK-NEXT: sldi r11, r11, 3 ; CHECK-NEXT: sldi r0, r0, 3 -; CHECK-NEXT: add r25, r21, r0 -; CHECK-NEXT: add r24, r22, r0 -; CHECK-NEXT: add r22, r22, r10 -; CHECK-NEXT: add r21, r21, r10 -; CHECK-NEXT: add r10, r16, r18 -; CHECK-NEXT: add r27, r19, r0 -; CHECK-NEXT: li r0, 0 -; CHECK-NEXT: sldi r18, r16, 5 +; CHECK-NEXT: add r25, r5, r11 +; CHECK-NEXT: add r24, r7, r11 +; CHECK-NEXT: add r23, r3, r11 +; CHECK-NEXT: add r11, r4, r10 +; CHECK-NEXT: add r26, r5, r0 +; CHECK-NEXT: sldi r11, r11, 3 +; CHECK-NEXT: add r22, r5, r11 +; CHECK-NEXT: add r11, r2, r10 +; CHECK-NEXT: add r10, r31, r10 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: sldi r11, r11, 3 +; CHECK-NEXT: add r20, r5, r10 +; CHECK-NEXT: add r19, r7, r10 +; CHECK-NEXT: add r18, r3, r10 +; CHECK-NEXT: add r10, r12, r4 +; CHECK-NEXT: add r21, r5, r11 +; CHECK-NEXT: sldi r11, r2, 3 ; CHECK-NEXT: sldi r10, r10, 3 -; CHECK-NEXT: add r19, r19, r10 -; CHECK-NEXT: mr r10, r16 +; CHECK-NEXT: add r17, r5, r10 +; CHECK-NEXT: add r10, r12, r2 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: add r16, r5, r10 +; CHECK-NEXT: add r10, r12, r31 +; CHECK-NEXT: sldi r31, r31, 3 +; CHECK-NEXT: sub r0, r11, r31 +; CHECK-NEXT: sldi r11, r4, 3 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: add r15, r5, r10 +; CHECK-NEXT: add r14, r3, r10 +; CHECK-NEXT: sub r31, r11, r31 +; CHECK-NEXT: add r2, r4, r10 +; CHECK-NEXT: li r11, 0 +; CHECK-NEXT: mr r10, r12 +; CHECK-NEXT: rldicl r7, r7, 2, 1 +; CHECK-NEXT: addi r7, r7, -4 +; CHECK-NEXT: rldicl r7, r7, 62, 2 +; CHECK-NEXT: addi r7, r7, 1 +; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: sldi r7, r12, 5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_3: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lfdux f0, r21, r18 -; CHECK-NEXT: lfdux f1, r22, r18 -; CHECK-NEXT: ld r15, 88(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: xsmuldp f0, f1, f0 -; CHECK-NEXT: lfd f1, 0(r19) -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfd f0, 0(r19) -; CHECK-NEXT: add r19, r19, r18 -; CHECK-NEXT: lfdx f0, r24, r0 -; CHECK-NEXT: lfdx f1, r25, r0 +; CHECK-NEXT: lfd f0, 0(r14) +; CHECK-NEXT: lfd f1, 0(r2) +; CHECK-NEXT: add r10, r10, r12 +; CHECK-NEXT: add r10, r10, r12 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r27, r0 +; CHECK-NEXT: lfd f1, 0(r15) +; CHECK-NEXT: add r10, r10, r12 +; CHECK-NEXT: add r10, r10, r12 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r27, r0 -; CHECK-NEXT: lfdx f0, r28, r0 -; CHECK-NEXT: lfdx f1, r30, r0 +; CHECK-NEXT: stfd f0, 0(r15) +; CHECK-NEXT: add r15, r15, r7 +; CHECK-NEXT: lfdx f0, r14, r0 +; CHECK-NEXT: lfdx f1, r2, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r3, r0 +; CHECK-NEXT: lfdx f1, r16, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r3, r0 -; CHECK-NEXT: lfdx f0, r4, r0 -; CHECK-NEXT: lfdx f1, r5, r0 +; CHECK-NEXT: stfdx f0, r16, r11 +; CHECK-NEXT: lfdx f0, r14, r31 +; CHECK-NEXT: lfdx f1, r2, r31 +; CHECK-NEXT: add r14, r14, r7 +; CHECK-NEXT: add r2, r2, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r7, r0 +; CHECK-NEXT: lfdx f1, r17, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r7, r0 -; CHECK-NEXT: lfdx f0, r9, r0 -; CHECK-NEXT: lfdx f1, r8, r0 +; CHECK-NEXT: stfdx f0, r17, r11 +; CHECK-NEXT: lfd f0, 0(r18) +; CHECK-NEXT: lfd f1, 0(r19) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r12, r0 +; CHECK-NEXT: lfdx f1, r20, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r12, r0 -; CHECK-NEXT: lfdx f0, r2, r0 -; CHECK-NEXT: lfdx f1, r31, r0 +; CHECK-NEXT: stfdx f0, r20, r11 +; CHECK-NEXT: lfdx f0, r18, r0 +; CHECK-NEXT: lfdx f1, r19, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r14, r0 +; CHECK-NEXT: lfdx f1, r21, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r14, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 96(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 104(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r21, r11 +; CHECK-NEXT: lfdx f0, r18, r31 +; CHECK-NEXT: lfdx f1, r19, r31 +; CHECK-NEXT: add r18, r18, r7 +; CHECK-NEXT: add r19, r19, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r17, r0 +; CHECK-NEXT: lfdx f1, r22, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r17, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 112(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 120(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r22, r11 +; CHECK-NEXT: lfd f0, 0(r23) +; CHECK-NEXT: lfd f1, 0(r24) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r20, r0 +; CHECK-NEXT: lfdx f1, r25, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r20, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 128(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 136(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r25, r11 +; CHECK-NEXT: lfdx f0, r23, r0 +; CHECK-NEXT: lfdx f1, r24, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r23, r0 +; CHECK-NEXT: lfdx f1, r26, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r23, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 144(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 152(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r26, r11 +; CHECK-NEXT: lfdx f0, r23, r31 +; CHECK-NEXT: lfdx f1, r24, r31 +; CHECK-NEXT: add r23, r23, r7 +; CHECK-NEXT: add r24, r24, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r26, r0 +; CHECK-NEXT: lfdx f1, r27, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r26, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 160(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 168(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r27, r11 +; CHECK-NEXT: lfd f0, 0(r28) +; CHECK-NEXT: lfd f1, 0(r29) ; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfdx f1, r30, r11 +; CHECK-NEXT: xsadddp f0, f1, f0 +; CHECK-NEXT: stfdx f0, r30, r11 +; CHECK-NEXT: lfdx f0, r28, r0 ; CHECK-NEXT: lfdx f1, r29, r0 +; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfdx f1, r8, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r29, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 176(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 +; CHECK-NEXT: stfdx f0, r8, r11 +; CHECK-NEXT: lfdx f0, r28, r31 +; CHECK-NEXT: lfdx f1, r29, r31 +; CHECK-NEXT: add r28, r28, r7 +; CHECK-NEXT: add r29, r29, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r11, r0 +; CHECK-NEXT: lfdx f1, r9, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r11, r0 -; CHECK-NEXT: add r0, r0, r18 +; CHECK-NEXT: stfdx f0, r9, r11 +; CHECK-NEXT: add r11, r11, r7 ; CHECK-NEXT: bdnz .LBB7_3 ; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: beq cr0, .LBB7_7 ; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader -; CHECK-NEXT: ld r12, 64(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r3, 40(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r8, 48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r30, 80(r1) # 8-byte Folded Reload -; CHECK-NEXT: sldi r4, r16, 3 -; CHECK-NEXT: ld r29, 72(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r28, 56(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r0, r10, r12 -; CHECK-NEXT: add r3, r10, r3 -; CHECK-NEXT: add r8, r10, r8 -; CHECK-NEXT: sub r10, r0, r16 -; CHECK-NEXT: sldi r7, r3, 3 -; CHECK-NEXT: sldi r11, r8, 3 +; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r7, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: sldi r8, r12, 3 +; CHECK-NEXT: add r0, r10, r0 +; CHECK-NEXT: add r7, r10, r7 ; CHECK-NEXT: sldi r0, r0, 3 -; CHECK-NEXT: sldi r12, r10, 3 -; CHECK-NEXT: add r3, r30, r7 -; CHECK-NEXT: add r5, r29, r7 -; CHECK-NEXT: add r7, r28, r7 -; CHECK-NEXT: add r8, r30, r11 -; CHECK-NEXT: add r9, r29, r11 -; CHECK-NEXT: add r11, r28, r11 -; CHECK-NEXT: add r30, r30, r0 -; CHECK-NEXT: li r0, 0 -; CHECK-NEXT: add r10, r28, r12 -; CHECK-NEXT: add r12, r29, r12 +; CHECK-NEXT: sldi r11, r7, 3 +; CHECK-NEXT: add r30, r5, r0 +; CHECK-NEXT: add r29, r4, r0 +; CHECK-NEXT: add r28, r3, r0 +; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r7, r5, r11 +; CHECK-NEXT: add r9, r4, r11 +; CHECK-NEXT: add r11, r3, r11 +; CHECK-NEXT: add r10, r10, r0 +; CHECK-NEXT: sub r12, r10, r12 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: sldi r12, r12, 3 +; CHECK-NEXT: add r5, r5, r10 +; CHECK-NEXT: li r10, 0 +; CHECK-NEXT: add r3, r3, r12 +; CHECK-NEXT: add r4, r4, r12 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_6: # %for.body.epil ; CHECK-NEXT: # -; CHECK-NEXT: lfdux f0, r12, r4 -; CHECK-NEXT: lfdux f1, r10, r4 +; CHECK-NEXT: lfdux f0, r4, r8 +; CHECK-NEXT: lfdux f1, r3, r8 ; CHECK-NEXT: addi r6, r6, -1 ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: xsmuldp f0, f1, f0 -; CHECK-NEXT: lfd f1, 0(r30) +; CHECK-NEXT: lfd f1, 0(r5) ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfd f0, 0(r30) -; CHECK-NEXT: add r30, r30, r4 -; CHECK-NEXT: lfdx f0, r11, r0 -; CHECK-NEXT: lfdx f1, r9, r0 +; CHECK-NEXT: stfd f0, 0(r5) +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: lfdx f0, r28, r10 +; CHECK-NEXT: lfdx f1, r29, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r8, r0 +; CHECK-NEXT: lfdx f1, r30, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r8, r0 -; CHECK-NEXT: lfdx f0, r7, r0 -; CHECK-NEXT: lfdx f1, r5, r0 +; CHECK-NEXT: stfdx f0, r30, r10 +; CHECK-NEXT: lfdx f0, r11, r10 +; CHECK-NEXT: lfdx f1, r9, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r3, r0 +; CHECK-NEXT: lfdx f1, r7, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r3, r0 -; CHECK-NEXT: add r0, r0, r4 +; CHECK-NEXT: stfdx f0, r7, r10 +; CHECK-NEXT: add r10, r10, r8 ; CHECK-NEXT: bne cr0, .LBB7_6 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup -; CHECK-NEXT: ld r2, 184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r31, 328(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r30, 320(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r29, 312(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: ld r28, 304(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r27, 296(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r26, 288(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r25, 280(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r24, 272(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r23, 264(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r22, 256(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r21, 248(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r20, 240(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r19, 232(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r18, 224(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r17, 216(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r16, 208(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r15, 200(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r14, 192(r1) # 8-byte Folded Reload -; CHECK-NEXT: addi r1, r1, 336 +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr entry: %cmp49 = icmp sgt i64 %m, 0