diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -39,6 +39,40 @@
 //   T *p = array[-1];
 //   for (int i = 0; i < n; ++i)
 //     *++p = c;
+//
+// 3: common multiple chains for the load/stores with the same offsets in the
+//    loop, so that we can reuse the offsets and reduce the register pressure
+//    in the loop. This transformation can also increase the loop ILP as now
+//    each chain uses its own loop induction add/addi. But this will increase
+//    the number of add/addi in the loop.
+//
+// Generically, this means transforming loops like this:
+//
+//   char *p;
+//   A1 = p + base1
+//   A2 = p + base1 + offset
+//   B1 = p + base2
+//   B2 = p + base2 + offset
+//
+//   for (int i = 0; i < n; i++) {
+//     unsigned long x1 = *(unsigned long *)(A1 + i);
+//     unsigned long x2 = *(unsigned long *)(A2 + i);
+//     unsigned long x3 = *(unsigned long *)(B1 + i);
+//     unsigned long x4 = *(unsigned long *)(B2 + i);
+//   }
+//
+// to look like this:
+//
+//   A1_new = p + base1 // chain 1
+//   B1_new = p + base2 // chain 2; now, inside the loop, the common offset is
+//                      // reused.
+//
+//   for (long long i = 0; i < n; i += count) {
+//     unsigned long x1 = *(unsigned long *)(A1_new + i);
+//     unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
+//     unsigned long x3 = *(unsigned long *)(B1_new + i);
+//     unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
+//   }
 //===----------------------------------------------------------------------===//
 
 #include "PPC.h"
@@ -90,6 +124,10 @@
     cl::init(true), cl::Hidden,
     cl::desc("prefer update form when ds form is also a update form"));
 
+static cl::opt<bool> EnableChainCommoning(
+    "ppc-formprep-chain-commoning", cl::init(true), cl::Hidden,
+    cl::desc("Enable chain commoning in PPC loop prepare pass."));
+
 // Sum of following 3 per loop thresholds for all loops can not be larger
 // than MaxVarsPrep.
 // now the thresholds for each kind prep are exterimental values on Power9.
@@ -106,6 +144,16 @@
     cl::Hidden, cl::init(8),
     cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
 
+// Commoning chains reduces register pressure, so we do not take the number of
+// PHI nodes into account here.
+// But commoning chains increases the number of add/addi instructions in the
+// loop and also increases loop ILP. The maximum number of chains should be
+// the same as the hardware IssueWidth, because we won't benefit from ILP if
+// the number of parallel chains is bigger than the IssueWidth. We assume
+// there are 2 chains in one bucket, so there would be 4 buckets at most on P9
+// (IssueWidth is 8).
+static cl::opt<unsigned> MaxVarsChainCommon(
+    "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
+    cl::desc("Bucket number per loop for PPC loop chain commoning"));
 
 // If would not be profitable if the common base has only one load/store, ISEL
 // should already be able to choose best load/store form based on offset for
@@ -116,12 +164,18 @@
     cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
              "preparation"));
 
+static cl::opt<unsigned> ChainCommonPrepMinThreshold(
+    "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
+    cl::desc("Minimal common base load/store instructions triggering chain "
+             "commoning preparation. Must not be smaller than 4."));
+
 STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
 STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
 STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
 STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
 STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
 STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
+STATISTIC(ChainCommoningRewritten, "Num of commoning chains");
 
 namespace {
   struct BucketElement {
@@ -133,11 +187,24 @@
   };
 
   struct Bucket {
-    Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
-                                            Elements(1, BucketElement(I)) {}
+    Bucket(const SCEV *B, Instruction *I)
+        : BaseSCEV(B), Elements(1, BucketElement(I)) {
+      ChainSize = 0;
+    }
+
+    // The base of the whole bucket.
     const SCEV *BaseSCEV;
+
+    // All elements in the bucket. In the bucket, the element with the BaseSCEV
+    // has no offset and all other elements are stored as offsets to the
+    // BaseSCEV.
     SmallVector<BucketElement, 16> Elements;
+
+    // The size of each potential chain. This is used for chain commoning only.
+    unsigned ChainSize;
+
+    // The base of each potential chain. This is used for chain commoning only.
+    SmallVector<BucketElement, 16> ChainBases;
   };
 
   // "UpdateForm" is not a real PPC instruction form, it stands for dform
@@ -193,17 +260,31 @@
     Value *getNodeForInc(Loop *L, Instruction *MemI,
                          const SCEV *BasePtrIncSCEV);
 
+    /// Common chains to reuse offsets for a loop to reduce register pressure.
+    bool chainCommoning(Loop *L, SmallVector<Bucket, 16> &Buckets);
+
+    /// Find out the potential commoning chains and their bases.
+    bool prepareBasesForCommoningChains(Bucket &BucketChain);
+
+    /// Rewrite the load/stores according to the common chains.
+    bool
+    rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket,
+                                        SmallSet<BasicBlock *, 16> &BBChanged);
+
     /// Collect condition matched(\p isValidCandidate() returns true)
     /// candidates in Loop \p L.
     SmallVector<Bucket, 16> collectCandidates(
         Loop *L,
-        std::function<bool(const Instruction *, const Value *, const Type *)>
+        std::function<bool(const Instruction *, Value *, const Type *)>
            isValidCandidate,
+        std::function<bool(const SCEV *)> isValidDiff,
        unsigned MaxCandidateNum);
 
-    /// Add a candidate to candidates \p Buckets.
+    /// Add a candidate to candidates \p Buckets if the diff between the
+    /// candidate and one base in \p Buckets matches \p isValidDiff.
     void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
                          SmallVector<Bucket, 16> &Buckets,
+                         std::function<bool(const SCEV *)> isValidDiff,
                          unsigned MaxCandidateNum);
 
     /// Prepare all candidates in \p Buckets for update form.
@@ -335,6 +416,221 @@
   return MadeChange;
 }
 
+// Finding the minimal (chain_number + reusable_offset_number) is a complicated
+// algorithmic problem.
+// For now, the algorithm used here is simply tuned to handle the patterns
+// produced by manual loop unrolling.
+// FIXME: use a more powerful algorithm to find the minimal sum of chain_number
+// and reusable_offset_number for one base with multiple offsets.
+bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {
+  // The minimal size for profitable chain commoning:
+  // A1 = base + offset1
+  // A2 = base + offset2 (offset2 - offset1 = X)
+  // A3 = base + offset3
+  // A4 = base + offset4 (offset4 - offset3 = X)
+  // ======>
+  // base1 = base + offset1
+  // base2 = base + offset3
+  // A1 = base1
+  // A2 = base1 + X
+  // A3 = base2
+  // A4 = base2 + X
+  //
+  // There is a benefit because of the reuse of offset 'X'.
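+  //
+  // For example (illustrative values only): offsets {0, 8, 64, 72} relative
+  // to 'base' form the two chains {base, base + 8} and {base + 64, base + 72}.
+  // Both chains reuse X = 8, so only the two chain bases and the single
+  // offset 8 need to be materialized in the loop.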
+
+  assert(ChainCommonPrepMinThreshold >= 4 &&
+         "Threshold cannot be smaller than 4!\n");
+  if (CBucket.Elements.size() < ChainCommonPrepMinThreshold)
+    return false;
+
+  // We simply select the offset between chain element 1 and element 0 as the
+  // first reusable offset, FirstOffset.
+  const SCEV *FirstOffset = CBucket.Elements[1].Offset;
+
+  // Figure out how many times the above FirstOffset is used in the bucket.
+  // For a successful commoning chain candidate, the offset difference between
+  // element 1 and element 0 of each chain must also be FirstOffset.
+  unsigned FirstOffsetReusedCount = 1;
+
+  // Figure out how many times the above FirstOffset is used in the first
+  // chain. The chain number is
+  // FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain.
+  unsigned FirstOffsetReusedCountInFirstChain = 1;
+
+  unsigned EleNum = CBucket.Elements.size();
+  bool SawChainSeparater = false;
+  for (unsigned j = 2; j != EleNum; ++j) {
+    if (SE->getMinusSCEV(CBucket.Elements[j].Offset,
+                         CBucket.Elements[j - 1].Offset) == FirstOffset) {
+      if (!SawChainSeparater)
+        FirstOffsetReusedCountInFirstChain++;
+      FirstOffsetReusedCount++;
+    } else
+      // For now, if we meet any offset which is not FirstOffset, we assume we
+      // have found a new chain.
+      // This makes us miss some opportunities.
+      // For example, we can common:
+      //
+      // {OffsetA, OffsetA, OffsetB, OffsetA, OffsetA, OffsetB}
+      //
+      // as two chains:
+      // {{OffsetA, OffsetA, OffsetB}, {OffsetA, OffsetA, OffsetB}}
+      // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 2
+      //
+      // But we fail to common:
+      //
+      // {OffsetA, OffsetB, OffsetA, OffsetA, OffsetB, OffsetA}
+      // FirstOffsetReusedCount = 4; FirstOffsetReusedCountInFirstChain = 1
+
+      SawChainSeparater = true;
+  }
+
+  // FirstOffset is not reused, skip this bucket.
+  if (FirstOffsetReusedCount == 1)
+    return false;
+
+  unsigned ChainNum =
+      FirstOffsetReusedCount / FirstOffsetReusedCountInFirstChain;
+
+  // All elements are increased by FirstOffset.
+  // The number of chains should be sqrt(EleNum).
+  if (!SawChainSeparater)
+    ChainNum = (unsigned)sqrt(EleNum);
+
+  CBucket.ChainSize = (unsigned)(EleNum / ChainNum);
+
+  // If this is not a perfect chain (e.g. not all elements can be put inside
+  // commoning chains), skip now.
+  if (CBucket.ChainSize * ChainNum != EleNum)
+    return false;
+
+  if (SawChainSeparater) {
+    // Check that the offset sequences are the same for all chains.
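+    //
+    // For example (illustrative values only): for two chains of size 3 whose
+    // element offsets are {0, X, Y, Z, Z + X, Z + Y}, the offsets of the
+    // second chain relative to its base Z must be {0, X, Y}, i.e. exactly the
+    // offsets of the first chain.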
+    for (unsigned i = 1; i < CBucket.ChainSize; i++)
+      for (unsigned j = 1; j < ChainNum; j++)
+        if (CBucket.Elements[i].Offset !=
+            SE->getMinusSCEV(CBucket.Elements[i + j * CBucket.ChainSize].Offset,
+                             CBucket.Elements[j * CBucket.ChainSize].Offset))
+          return false;
+  }
+
+  for (unsigned i = 0; i < ChainNum; i++)
+    CBucket.ChainBases.push_back(CBucket.Elements[i * CBucket.ChainSize]);
+
+  LLVM_DEBUG(dbgs() << "Bucket has " << ChainNum << " chains.\n");
+
+  return true;
+}
+
+bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
+                                          SmallVector<Bucket, 16> &Buckets) {
+  bool MadeChange = false;
+
+  if (Buckets.empty())
+    return MadeChange;
+
+  SmallSet<BasicBlock *, 16> BBChanged;
+
+  for (auto &Bucket : Buckets) {
+    if (prepareBasesForCommoningChains(Bucket))
+      MadeChange |= rewriteLoadStoresForCommoningChains(L, Bucket, BBChanged);
+  }
+
+  if (MadeChange)
+    for (auto *BB : BBChanged)
+      DeleteDeadPHIs(BB);
+  return MadeChange;
+}
+
+bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
+    Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {
+  bool MadeChange = false;
+
+  assert(Bucket.Elements.size() ==
+             Bucket.ChainBases.size() * Bucket.ChainSize &&
+         "invalid bucket for chain commoning!\n");
+  SmallPtrSet<Value *, 16> DeletedPtrs;
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+  Type *I64Ty = Type::getInt64Ty(Header->getContext());
+
+  SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
+                     "loopprepare-chaincommon");
+
+  for (unsigned ChainIdx = 0; ChainIdx < Bucket.ChainBases.size(); ++ChainIdx) {
+    unsigned BaseElemIdx = Bucket.ChainSize * ChainIdx;
+    const SCEV *BaseSCEV =
+        ChainIdx ? SE->getAddExpr(Bucket.BaseSCEV,
+                                  Bucket.Elements[BaseElemIdx].Offset)
+                 : Bucket.BaseSCEV;
+    const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
+
+    // Make sure the base is able to expand.
+    if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
+      return MadeChange;
+
+    assert(BasePtrSCEV->isAffine() &&
+           "Invalid SCEV type for the base ptr for a candidate chain!\n");
+
+    std::pair<Instruction *, Instruction *> Base =
+        rewriteForBase(L, BasePtrSCEV, Bucket.Elements[BaseElemIdx].Instr,
+                       false /* CanPreInc */, UpdateForm, SCEVE, DeletedPtrs);
+
+    if (!Base.first || !Base.second)
+      return MadeChange;
+
+    // Keep track of the replacement pointer values we've inserted so that we
+    // don't generate more pointer values than necessary.
+    SmallPtrSet<Value *, 16> NewPtrs;
+    NewPtrs.insert(Base.first);
+
+    for (unsigned Idx = BaseElemIdx + 1; Idx < BaseElemIdx + Bucket.ChainSize;
+         ++Idx) {
+      BucketElement &I = Bucket.Elements[Idx];
+      Value *Ptr = getPointerOperandAndType(I.Instr);
+      assert(Ptr && "No pointer operand");
+      if (NewPtrs.count(Ptr))
+        continue;
+
+      const SCEV *OffsetSCEV =
+          BaseElemIdx ? SE->getMinusSCEV(Bucket.Elements[Idx].Offset,
+                                         Bucket.Elements[BaseElemIdx].Offset)
+                      : Bucket.Elements[Idx].Offset;
+
+      // Make sure the offset is able to expand. We only need to check once, as
+      // the offsets are reused between different chains.
+      if (!BaseElemIdx)
+        if (!isSafeToExpand(OffsetSCEV, *SE))
+          return false;
+
+      Value *OffsetValue = SCEVE.expandCodeFor(
+          OffsetSCEV, I64Ty, LoopPredecessor->getTerminator());
+
+      Instruction *NewPtr = rewriteForBucketElement(Base, Bucket.Elements[Idx],
+                                                    OffsetValue, DeletedPtrs);
+
+      assert(NewPtr && "Wrong rewrite!\n");
+      NewPtrs.insert(NewPtr);
+    }
+
+    ++ChainCommoningRewritten;
+  }
+
+  // Clear the rewriter cache, because values that are in the rewriter's cache
+  // can be deleted below, causing the AssertingVH in the cache to trigger.
+ SCEVE.clear(); + + for (auto *Ptr : DeletedPtrs) { + if (Instruction *IDel = dyn_cast(Ptr)) + BBChanged.insert(IDel->getParent()); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + } + + MadeChange = true; + return MadeChange; +} + // Rewrite the new base according to BasePtrSCEV. // bb.loop.preheader: // %newstart = ... @@ -522,35 +818,43 @@ return ReplNewPtr; } -void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, - SmallVector &Buckets, - unsigned MaxCandidateNum) { +void PPCLoopInstrFormPrep::addOneCandidate( + Instruction *MemI, const SCEV *LSCEV, SmallVector &Buckets, + std::function isValidDiff, unsigned MaxCandidateNum) { assert((MemI && getPointerOperandAndType(MemI)) && "Candidate should be a memory instruction."); assert(LSCEV && "Invalid SCEV for Ptr value."); + bool FoundBucket = false; for (auto &B : Buckets) { + if (cast(B.BaseSCEV)->getStepRecurrence(*SE) != + cast(LSCEV)->getStepRecurrence(*SE)) + continue; const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); - if (const auto *CDiff = dyn_cast(Diff)) { - B.Elements.push_back(BucketElement(CDiff, MemI)); + if (isValidDiff(Diff)) { + B.Elements.push_back(BucketElement(Diff, MemI)); FoundBucket = true; break; } } if (!FoundBucket) { - if (Buckets.size() == MaxCandidateNum) + if (Buckets.size() == MaxCandidateNum) { + LLVM_DEBUG(dbgs() << "Can not prepare more chains, reach maximum limit " + << MaxCandidateNum << "\n"); return; + } Buckets.push_back(Bucket(LSCEV, MemI)); } } SmallVector PPCLoopInstrFormPrep::collectCandidates( Loop *L, - std::function + std::function isValidCandidate, - unsigned MaxCandidateNum) { + std::function isValidDiff, unsigned MaxCandidateNum) { SmallVector Buckets; + for (const auto &BB : L->blocks()) for (auto &J : *BB) { Value *PtrValue = nullptr; @@ -575,7 +879,7 @@ HasCandidateForPrepare = true; if (isValidCandidate(&J, PtrValue, PointerElementType)) - addOneCandidate(&J, LSCEV, Buckets, MaxCandidateNum); + addOneCandidate(&J, LSCEV, Buckets, isValidDiff, MaxCandidateNum); } return Buckets; } @@ -712,7 +1016,8 @@ SmallPtrSet DeletedPtrs; BasicBlock *Header = L->getHeader(); - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), + "loopprepare-formrewrite"); // For some DS form load/store instructions, it can also be an update form, // if the stride is constant and is a multipler of 4. Use update form if @@ -990,7 +1295,7 @@ } // Check if a load/store has update form. This lambda is used by function // collectCandidates which can collect candidates for types defined by lambda. - auto isUpdateFormCandidate = [&](const Instruction *I, const Value *PtrValue, + auto isUpdateFormCandidate = [&](const Instruction *I, Value *PtrValue, const Type *PointerElementType) { assert((PtrValue && I) && "Invalid parameter!"); // There are no update forms for Altivec vector load/stores. @@ -1022,7 +1327,7 @@ }; // Check if a load/store has DS form. - auto isDSFormCandidate = [](const Instruction *I, const Value *PtrValue, + auto isDSFormCandidate = [](const Instruction *I, Value *PtrValue, const Type *PointerElementType) { assert((PtrValue && I) && "Invalid parameter!"); if (isa(I)) @@ -1036,7 +1341,7 @@ }; // Check if a load/store has DQ form. 
-  auto isDQFormCandidate = [&](const Instruction *I, const Value *PtrValue,
+  auto isDQFormCandidate = [&](const Instruction *I, Value *PtrValue,
                                const Type *PointerElementType) {
     assert((PtrValue && I) && "Invalid parameter!");
     // Check if it is a P10 lxvp/stxvp intrinsic.
@@ -1048,37 +1353,131 @@
     return ST && ST->hasP9Vector() && (PointerElementType->isVectorTy());
   };
 
+  // Check if a load/store is a candidate for chain commoning.
+  // If the start of the SCEV has only one pointer operand, we can use that
+  // start as a chain separator. Mark this load/store as a candidate.
+  auto isChainCommoningCandidate = [&](const Instruction *I, Value *PtrValue,
+                                       const Type *PointerElementType) {
+    const SCEVAddRecExpr *ARSCEV =
+        cast<SCEVAddRecExpr>(SE->getSCEVAtScope(PtrValue, L));
+    if (!ARSCEV)
+      return false;
+
+    if (!ARSCEV->isAffine())
+      return false;
+
+    const SCEV *Start = ARSCEV->getStart();
+
+    // A single pointer. We can treat it as offset 0.
+    if (isa<SCEVUnknown>(Start) && Start->getType()->isPointerTy())
+      return true;
+
+    const SCEVAddExpr *ASCEV = dyn_cast<SCEVAddExpr>(Start);
+
+    // We need a SCEVAddExpr to include both base and offset.
+    if (!ASCEV)
+      return false;
+
+    // Make sure there is only one pointer operand (the base) and all other
+    // operands are of integer type.
+    bool SawPointer = false;
+    for (const SCEV *Op : ASCEV->operands()) {
+      if (Op->getType()->isPointerTy()) {
+        if (SawPointer)
+          return false;
+        SawPointer = true;
+      } else if (!Op->getType()->isIntegerTy())
+        return false;
+    }
+
+    return SawPointer;
+  };
+
+  // Check if the diff is a constant type. This is used for update/DS/DQ form
+  // preparation.
+  auto isValidConstantDiff = [](const SCEV *Diff) {
+    return dyn_cast<SCEVConstant>(Diff) != nullptr;
+  };
+
+  // Make sure the diff between the base and the new candidate is of the
+  // required type. This is used for chain commoning preparation.
+  auto isValidChainCommoningDiff = [](const SCEV *Diff) {
+    assert(Diff && "Invalid Diff!\n");
+
+    // Don't mess up the previous dform prepare.
+    if (isa<SCEVConstant>(Diff))
+      return false;
+
+    // A single integer type offset.
+    if (isa<SCEVUnknown>(Diff) && Diff->getType()->isIntegerTy())
+      return true;
+
+    const SCEVNAryExpr *ADiff = dyn_cast<SCEVNAryExpr>(Diff);
+    if (!ADiff)
+      return false;
+
+    for (const SCEV *Op : ADiff->operands())
+      if (!Op->getType()->isIntegerTy())
+        return false;
+
+    return true;
+  };
+
   HasCandidateForPrepare = false;
 
+  LLVM_DEBUG(dbgs() << "Start to prepare for update form.\n");
   // Collect buckets of comparable addresses used by loads and stores for update
   // form.
-  SmallVector<Bucket, 16> UpdateFormBuckets =
-      collectCandidates(L, isUpdateFormCandidate, MaxVarsUpdateForm);
+  SmallVector<Bucket, 16> UpdateFormBuckets = collectCandidates(
+      L, isUpdateFormCandidate, isValidConstantDiff, MaxVarsUpdateForm);
 
   // Prepare for update form.
   if (!UpdateFormBuckets.empty())
     MadeChange |= updateFormPrep(L, UpdateFormBuckets);
-  else if (!HasCandidateForPrepare)
+  else if (!HasCandidateForPrepare) {
+    LLVM_DEBUG(
+        dbgs()
+        << "No prepare candidates found, stop preparation for current loop!\n");
     // If no candidate for preparing, return early.
     return MadeChange;
+  }
 
+  LLVM_DEBUG(dbgs() << "Start to prepare for DS form.\n");
   // Collect buckets of comparable addresses used by loads and stores for DS
   // form.
-  SmallVector<Bucket, 16> DSFormBuckets =
-      collectCandidates(L, isDSFormCandidate, MaxVarsDSForm);
+  SmallVector<Bucket, 16> DSFormBuckets = collectCandidates(
+      L, isDSFormCandidate, isValidConstantDiff, MaxVarsDSForm);
 
   // Prepare for DS form.
if (!DSFormBuckets.empty()) MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm); + LLVM_DEBUG(dbgs() << "Start to prepare for DQ form.\n"); // Collect buckets of comparable addresses used by loads and stores for DQ // form. - SmallVector DQFormBuckets = - collectCandidates(L, isDQFormCandidate, MaxVarsDQForm); + SmallVector DQFormBuckets = collectCandidates( + L, isDQFormCandidate, isValidConstantDiff, MaxVarsDQForm); // Prepare for DQ form. if (!DQFormBuckets.empty()) MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm); + // Collect buckets of comparable addresses used by loads and stores for chain + // commoning. With chain commoning, we reuse offsets between the chains, so + // the register pressure will be reduced. + if (!EnableChainCommoning) { + LLVM_DEBUG(dbgs() << "Chain commoning is not enabled.\n"); + return MadeChange; + } + + LLVM_DEBUG(dbgs() << "Start to prepare for chain commoning.\n"); + SmallVector Buckets = + collectCandidates(L, isChainCommoningCandidate, isValidChainCommoningDiff, + MaxVarsChainCommon); + + // Prepare for chain commoning. + if (!Buckets.empty()) + MadeChange |= chainCommoning(L, Buckets); + return MadeChange; } diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll --- a/llvm/test/CodeGen/PowerPC/common-chain.ll +++ b/llvm/test/CodeGen/PowerPC/common-chain.ll @@ -38,23 +38,26 @@ ; CHECK-NEXT: cmpdi r6, 1 ; CHECK-NEXT: blt cr0, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r8, r4, 1 +; CHECK-NEXT: sldi r7, r4, 1 ; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: add r8, r4, r7 +; CHECK-NEXT: add r7, r5, r4 +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: add r7, r3, r7 ; CHECK-NEXT: add r5, r3, r5 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: sldi r7, r4, 2 -; CHECK-NEXT: add r9, r4, r8 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r6, r5, r4 -; CHECK-NEXT: ldx r10, r5, r8 -; CHECK-NEXT: ldx r11, r5, r9 -; CHECK-NEXT: ldx r12, r5, r7 +; CHECK-NEXT: ld r6, 0(r7) +; CHECK-NEXT: ldx r8, r7, r4 +; CHECK-NEXT: ld r9, 0(r5) +; CHECK-NEXT: ldx r10, r5, r4 +; CHECK-NEXT: addi r7, r7, 1 ; CHECK-NEXT: addi r5, r5, 1 -; CHECK-NEXT: mulld r6, r10, r6 -; CHECK-NEXT: mulld r6, r6, r11 -; CHECK-NEXT: maddld r3, r6, r12, r3 +; CHECK-NEXT: mulld r6, r8, r6 +; CHECK-NEXT: mulld r6, r6, r9 +; CHECK-NEXT: maddld r3, r6, r10, r3 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-NEXT: blr @@ -108,8 +111,8 @@ ; 4: + offset ; 5: + offset ; -; It can not be commoned to chains because we will need a chain for a single address, -; which can not make the commoning be profitable. +; It can not be commoned to chains because we need a chain for a single address. +; It is not profitable to common chains if not all addresses are in chains. ; ; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) { ; long long o1 = base1 + offset; @@ -304,7 +307,7 @@ ; 3: + 2*offset ; 4: + 3*offset ; -; The diff between address 2 and address 1 is offset, and this offset is not reused among other addresses, +; The diff between address 2 and address 1 is 2*offset, and this offset is not reused among other chains, ; so we can not common any chains. ; ; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) { @@ -404,7 +407,7 @@ ; 5: + 1*offset ; 6: + 2*offset ; -; The diff between address 2 and address 1 is offset, and this offset is reused between address 4 and address 5. 
+; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5. ; but the diff between address 3 and address 2 (3*offset) is not the same with the diff between address 6 ; and address 5(2*offset), so we can not common chains for these addresses. ; @@ -564,24 +567,26 @@ ; CHECK-NEXT: cmpdi r6, 1 ; CHECK-NEXT: blt cr0, .LBB5_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: mulli r7, r4, 6 -; CHECK-NEXT: add r5, r3, r5 -; CHECK-NEXT: sldi r3, r4, 1 -; CHECK-NEXT: add r9, r4, r3 -; CHECK-NEXT: mtctr r6 ; CHECK-NEXT: sldi r8, r4, 2 +; CHECK-NEXT: add r7, r5, r4 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: add r7, r3, r7 +; CHECK-NEXT: sldi r4, r4, 1 +; CHECK-NEXT: add r5, r3, r5 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r6, r5, r4 -; CHECK-NEXT: ldx r10, r5, r9 -; CHECK-NEXT: ldx r11, r5, r8 -; CHECK-NEXT: ldx r12, r5, r7 +; CHECK-NEXT: ld r6, 0(r7) +; CHECK-NEXT: ldx r8, r7, r4 +; CHECK-NEXT: ld r9, 0(r5) +; CHECK-NEXT: ldx r10, r5, r4 +; CHECK-NEXT: addi r7, r7, 1 ; CHECK-NEXT: addi r5, r5, 1 -; CHECK-NEXT: mulld r6, r10, r6 -; CHECK-NEXT: mulld r6, r6, r11 -; CHECK-NEXT: maddld r3, r6, r12, r3 +; CHECK-NEXT: mulld r6, r8, r6 +; CHECK-NEXT: mulld r6, r6, r9 +; CHECK-NEXT: maddld r3, r6, r10, r3 ; CHECK-NEXT: bdnz .LBB5_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-NEXT: blr @@ -664,32 +669,30 @@ ; CHECK-NEXT: cmpdi r7, 1 ; CHECK-NEXT: blt cr0, .LBB6_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r8, r4, 1 -; CHECK-NEXT: mtctr r7 -; CHECK-NEXT: add r9, r4, r8 -; CHECK-NEXT: add r8, r6, r9 ; CHECK-NEXT: add r6, r6, r4 -; CHECK-NEXT: add r9, r5, r9 ; CHECK-NEXT: add r5, r5, r4 -; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: sldi r4, r4, 1 +; CHECK-NEXT: add r5, r3, r5 +; CHECK-NEXT: add r6, r3, r6 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: ldx r7, r3, r5 -; CHECK-NEXT: ldx r10, r3, r9 -; CHECK-NEXT: ldx r11, r3, r6 -; CHECK-NEXT: ldx r12, r3, r8 -; CHECK-NEXT: addi r3, r3, 1 -; CHECK-NEXT: mulld r7, r10, r7 -; CHECK-NEXT: mulld r7, r7, r11 -; CHECK-NEXT: maddld r4, r7, r12, r4 +; CHECK-NEXT: ld r7, 0(r5) +; CHECK-NEXT: ldx r8, r5, r4 +; CHECK-NEXT: ld r9, 0(r6) +; CHECK-NEXT: ldx r10, r6, r4 +; CHECK-NEXT: addi r5, r5, 1 +; CHECK-NEXT: addi r6, r6, 1 +; CHECK-NEXT: mulld r7, r8, r7 +; CHECK-NEXT: mulld r7, r7, r9 +; CHECK-NEXT: maddld r3, r7, r10, r3 ; CHECK-NEXT: bdnz .LBB6_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup -; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB6_4: -; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %mul = mul nsw i64 %offset, 3 @@ -748,328 +751,272 @@ define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) { ; CHECK-LABEL: spill_reduce_succ: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stdu r1, -336(r1) -; CHECK-NEXT: .cfi_def_cfa_offset 336 -; CHECK-NEXT: .cfi_offset r14, -144 -; CHECK-NEXT: .cfi_offset r15, -136 -; CHECK-NEXT: .cfi_offset r16, -128 -; CHECK-NEXT: .cfi_offset r17, -120 -; CHECK-NEXT: .cfi_offset r18, -112 -; CHECK-NEXT: .cfi_offset r19, -104 -; CHECK-NEXT: .cfi_offset r20, -96 -; CHECK-NEXT: .cfi_offset r21, -88 -; CHECK-NEXT: .cfi_offset r22, -80 -; CHECK-NEXT: .cfi_offset r23, -72 -; CHECK-NEXT: .cfi_offset r24, -64 -; CHECK-NEXT: 
.cfi_offset r25, -56 -; CHECK-NEXT: .cfi_offset r26, -48 -; CHECK-NEXT: .cfi_offset r27, -40 -; CHECK-NEXT: .cfi_offset r28, -32 -; CHECK-NEXT: .cfi_offset r29, -24 -; CHECK-NEXT: .cfi_offset r30, -16 -; CHECK-NEXT: .cfi_offset r31, -8 -; CHECK-NEXT: .cfi_offset r2, -152 ; CHECK-NEXT: cmpdi r6, 1 -; CHECK-NEXT: std r14, 192(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r15, 200(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r16, 208(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r17, 216(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r18, 224(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r19, 232(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r20, 240(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r21, 248(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r22, 256(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r23, 264(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r24, 272(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r25, 280(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r26, 288(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r27, 296(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r28, 304(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r29, 312(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r30, 320(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r31, 328(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r2, 184(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r9, 40(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r8, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r7, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r5, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r4, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r9, -176(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r8, -168(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r7, -160(r1) # 8-byte Folded Spill ; CHECK-NEXT: blt cr0, .LBB7_7 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r3, r6, 2 -; CHECK-NEXT: li r4, 1 -; CHECK-NEXT: mr r16, r10 -; CHECK-NEXT: cmpdi r3, 1 -; CHECK-NEXT: iselgt r3, r3, r4 -; CHECK-NEXT: addi r4, r3, -1 -; CHECK-NEXT: clrldi r6, r3, 63 -; CHECK-NEXT: cmpldi r4, 3 +; CHECK-NEXT: sldi r6, r6, 2 +; CHECK-NEXT: li r7, 1 +; CHECK-NEXT: mr r12, r10 +; CHECK-NEXT: cmpdi r6, 1 +; CHECK-NEXT: iselgt r7, r6, r7 +; CHECK-NEXT: addi r8, r7, -1 +; CHECK-NEXT: clrldi r6, r7, 63 +; CHECK-NEXT: cmpldi r8, 3 ; CHECK-NEXT: blt cr0, .LBB7_4 ; CHECK-NEXT: # %bb.2: # %for.body.preheader.new -; CHECK-NEXT: ld r30, 40(r1) # 8-byte Folded Reload -; CHECK-NEXT: sldi r4, r16, 2 -; CHECK-NEXT: ld 
r19, 80(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r21, 72(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r22, 56(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r27, 48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r18, 64(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r5, r30, r4 -; CHECK-NEXT: rldicl r0, r3, 62, 2 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r11, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 168(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r27, r4 -; CHECK-NEXT: add r4, r18, r4 -; CHECK-NEXT: std r7, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r29, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: add r26, r19, r4 -; CHECK-NEXT: std r5, 152(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r21, r4 -; CHECK-NEXT: add r4, r22, r4 -; CHECK-NEXT: std r7, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r4, 136(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r4, r16, 1 -; CHECK-NEXT: std r5, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r10, r16, r4 -; CHECK-NEXT: add r3, r18, r4 -; CHECK-NEXT: add r5, r30, r10 -; CHECK-NEXT: sldi r3, r3, 3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r23, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r27, r10 -; CHECK-NEXT: std r7, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r20, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r18, r10 -; CHECK-NEXT: std r7, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: sub r10, r18, r10 -; CHECK-NEXT: sldi r5, r5, 3 +; CHECK-NEXT: rldicl r7, r7, 62, 2 +; CHECK-NEXT: sldi r10, r12, 2 +; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r31, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill +; CHECK-NEXT: mr r7, r4 +; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r8, r4, r10 +; CHECK-NEXT: sldi r8, r8, 3 +; CHECK-NEXT: add r9, r5, r8 +; CHECK-NEXT: add r8, r2, r10 +; CHECK-NEXT: add r10, r31, r10 ; CHECK-NEXT: sldi r10, r10, 3 -; CHECK-NEXT: add r17, r19, r5 -; CHECK-NEXT: add r7, r21, r5 -; CHECK-NEXT: add r5, r22, r5 -; CHECK-NEXT: std r5, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r5, r30, r4 -; CHECK-NEXT: std r7, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: add r7, r19, r3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r14, r19, r5 -; CHECK-NEXT: add r31, r21, r5 -; CHECK-NEXT: add r2, r22, r5 -; CHECK-NEXT: add r5, r27, r4 -; CHECK-NEXT: add r4, r22, r3 -; CHECK-NEXT: sldi r5, r5, 3 -; CHECK-NEXT: add r12, r19, r5 -; CHECK-NEXT: add r8, r21, r5 -; CHECK-NEXT: add r9, r22, r5 -; CHECK-NEXT: add r5, r21, r3 -; CHECK-NEXT: add r3, r16, r30 -; CHECK-NEXT: rldicl r30, r0, 2, 1 -; CHECK-NEXT: addi r0, r30, -4 -; CHECK-NEXT: sldi r28, r3, 3 -; CHECK-NEXT: rldicl r30, r0, 62, 2 -; CHECK-NEXT: add r3, r19, r28 -; CHECK-NEXT: addi r0, r30, 1 -; CHECK-NEXT: add r30, r21, r28 -; CHECK-NEXT: add r28, r22, r28 -; CHECK-NEXT: mtctr r0 -; CHECK-NEXT: add r0, r16, r27 +; CHECK-NEXT: sldi r8, r8, 3 +; CHECK-NEXT: add r30, r5, r10 +; CHECK-NEXT: add r29, r7, r10 +; CHECK-NEXT: add r28, r3, r10 +; CHECK-NEXT: sldi r10, r12, 1 +; CHECK-NEXT: add r8, r5, r8 +; CHECK-NEXT: add r11, r12, r10 +; CHECK-NEXT: add r0, r4, r11 +; CHECK-NEXT: sldi r0, r0, 3 +; 
CHECK-NEXT: add r27, r5, r0 +; CHECK-NEXT: add r0, r2, r11 +; CHECK-NEXT: add r11, r31, r11 +; CHECK-NEXT: sldi r11, r11, 3 ; CHECK-NEXT: sldi r0, r0, 3 -; CHECK-NEXT: add r25, r21, r0 -; CHECK-NEXT: add r24, r22, r0 -; CHECK-NEXT: add r22, r22, r10 -; CHECK-NEXT: add r21, r21, r10 -; CHECK-NEXT: add r10, r16, r18 -; CHECK-NEXT: add r27, r19, r0 -; CHECK-NEXT: li r0, 0 -; CHECK-NEXT: sldi r18, r16, 5 +; CHECK-NEXT: add r25, r5, r11 +; CHECK-NEXT: add r24, r7, r11 +; CHECK-NEXT: add r23, r3, r11 +; CHECK-NEXT: add r11, r4, r10 +; CHECK-NEXT: add r26, r5, r0 +; CHECK-NEXT: sldi r11, r11, 3 +; CHECK-NEXT: add r22, r5, r11 +; CHECK-NEXT: add r11, r2, r10 +; CHECK-NEXT: add r10, r31, r10 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: sldi r11, r11, 3 +; CHECK-NEXT: add r20, r5, r10 +; CHECK-NEXT: add r19, r7, r10 +; CHECK-NEXT: add r18, r3, r10 +; CHECK-NEXT: add r10, r12, r4 +; CHECK-NEXT: add r21, r5, r11 +; CHECK-NEXT: sldi r11, r2, 3 ; CHECK-NEXT: sldi r10, r10, 3 -; CHECK-NEXT: add r19, r19, r10 -; CHECK-NEXT: mr r10, r16 +; CHECK-NEXT: add r17, r5, r10 +; CHECK-NEXT: add r10, r12, r2 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: add r16, r5, r10 +; CHECK-NEXT: add r10, r12, r31 +; CHECK-NEXT: sldi r31, r31, 3 +; CHECK-NEXT: sub r0, r11, r31 +; CHECK-NEXT: sldi r11, r4, 3 +; CHECK-NEXT: mr r4, r7 +; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: add r15, r5, r10 +; CHECK-NEXT: add r14, r3, r10 +; CHECK-NEXT: sub r31, r11, r31 +; CHECK-NEXT: add r2, r4, r10 +; CHECK-NEXT: li r11, 0 +; CHECK-NEXT: mr r10, r12 +; CHECK-NEXT: rldicl r7, r7, 2, 1 +; CHECK-NEXT: addi r7, r7, -4 +; CHECK-NEXT: rldicl r7, r7, 62, 2 +; CHECK-NEXT: addi r7, r7, 1 +; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: sldi r7, r12, 5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_3: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lfdux f0, r21, r18 -; CHECK-NEXT: lfdux f1, r22, r18 -; CHECK-NEXT: ld r15, 88(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: xsmuldp f0, f1, f0 -; CHECK-NEXT: lfd f1, 0(r19) -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: add r10, r10, r16 -; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfd f0, 0(r19) -; CHECK-NEXT: add r19, r19, r18 -; CHECK-NEXT: lfdx f0, r24, r0 -; CHECK-NEXT: lfdx f1, r25, r0 +; CHECK-NEXT: lfd f0, 0(r14) +; CHECK-NEXT: lfd f1, 0(r2) +; CHECK-NEXT: add r10, r10, r12 +; CHECK-NEXT: add r10, r10, r12 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r27, r0 +; CHECK-NEXT: lfd f1, 0(r15) +; CHECK-NEXT: add r10, r10, r12 +; CHECK-NEXT: add r10, r10, r12 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r27, r0 -; CHECK-NEXT: lfdx f0, r28, r0 -; CHECK-NEXT: lfdx f1, r30, r0 +; CHECK-NEXT: stfd f0, 0(r15) +; CHECK-NEXT: add r15, r15, r7 +; CHECK-NEXT: lfdx f0, r14, r0 +; CHECK-NEXT: lfdx f1, r2, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r3, r0 +; CHECK-NEXT: lfdx f1, r16, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r3, r0 -; CHECK-NEXT: lfdx f0, r4, r0 -; CHECK-NEXT: lfdx f1, r5, r0 +; CHECK-NEXT: stfdx f0, r16, r11 +; CHECK-NEXT: lfdx f0, r14, r31 +; CHECK-NEXT: lfdx f1, r2, r31 +; CHECK-NEXT: add r14, r14, r7 +; CHECK-NEXT: add r2, r2, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r7, r0 +; CHECK-NEXT: lfdx f1, r17, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r7, r0 -; CHECK-NEXT: lfdx f0, r9, r0 -; CHECK-NEXT: lfdx f1, r8, r0 +; CHECK-NEXT: stfdx f0, r17, r11 +; CHECK-NEXT: lfd f0, 0(r18) +; 
CHECK-NEXT: lfd f1, 0(r19) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r12, r0 +; CHECK-NEXT: lfdx f1, r20, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r12, r0 -; CHECK-NEXT: lfdx f0, r2, r0 -; CHECK-NEXT: lfdx f1, r31, r0 +; CHECK-NEXT: stfdx f0, r20, r11 +; CHECK-NEXT: lfdx f0, r18, r0 +; CHECK-NEXT: lfdx f1, r19, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r14, r0 +; CHECK-NEXT: lfdx f1, r21, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r14, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 96(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 104(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r21, r11 +; CHECK-NEXT: lfdx f0, r18, r31 +; CHECK-NEXT: lfdx f1, r19, r31 +; CHECK-NEXT: add r18, r18, r7 +; CHECK-NEXT: add r19, r19, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r17, r0 +; CHECK-NEXT: lfdx f1, r22, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r17, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 112(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 120(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r22, r11 +; CHECK-NEXT: lfd f0, 0(r23) +; CHECK-NEXT: lfd f1, 0(r24) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r20, r0 +; CHECK-NEXT: lfdx f1, r25, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r20, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 128(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 136(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r25, r11 +; CHECK-NEXT: lfdx f0, r23, r0 +; CHECK-NEXT: lfdx f1, r24, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r23, r0 +; CHECK-NEXT: lfdx f1, r26, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r23, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 144(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 152(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r26, r11 +; CHECK-NEXT: lfdx f0, r23, r31 +; CHECK-NEXT: lfdx f1, r24, r31 +; CHECK-NEXT: add r23, r23, r7 +; CHECK-NEXT: add r24, r24, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r26, r0 +; CHECK-NEXT: lfdx f1, r27, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r26, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 160(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 -; CHECK-NEXT: ld r15, 168(r1) # 8-byte Folded Reload +; CHECK-NEXT: stfdx f0, r27, r11 +; CHECK-NEXT: lfd f0, 0(r28) +; CHECK-NEXT: lfd f1, 0(r29) ; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfdx f1, r30, r11 +; CHECK-NEXT: xsadddp f0, f1, f0 +; CHECK-NEXT: stfdx f0, r30, r11 +; CHECK-NEXT: lfdx f0, r28, r0 ; CHECK-NEXT: lfdx f1, r29, r0 +; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfdx f1, r8, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r29, r0 -; CHECK-NEXT: lfdx f0, r15, r0 -; CHECK-NEXT: ld r15, 176(r1) # 8-byte Folded Reload -; CHECK-NEXT: lfdx f1, r15, r0 +; CHECK-NEXT: stfdx f0, r8, r11 +; CHECK-NEXT: lfdx f0, r28, r31 +; CHECK-NEXT: lfdx f1, r29, r31 +; CHECK-NEXT: add r28, r28, r7 +; CHECK-NEXT: add r29, r29, r7 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r11, r0 +; CHECK-NEXT: lfdx f1, r9, r11 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r11, r0 -; CHECK-NEXT: add r0, r0, r18 +; CHECK-NEXT: stfdx f0, r9, r11 +; CHECK-NEXT: add r11, r11, r7 ; CHECK-NEXT: bdnz .LBB7_3 ; CHECK-NEXT: 
.LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: beq cr0, .LBB7_7 ; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader -; CHECK-NEXT: ld r12, 64(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r3, 40(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r8, 48(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r30, 80(r1) # 8-byte Folded Reload -; CHECK-NEXT: sldi r4, r16, 3 -; CHECK-NEXT: ld r29, 72(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r28, 56(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r0, r10, r12 -; CHECK-NEXT: add r3, r10, r3 -; CHECK-NEXT: add r8, r10, r8 -; CHECK-NEXT: sub r10, r0, r16 -; CHECK-NEXT: sldi r7, r3, 3 -; CHECK-NEXT: sldi r11, r8, 3 +; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r7, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: sldi r8, r12, 3 +; CHECK-NEXT: add r0, r10, r0 +; CHECK-NEXT: add r7, r10, r7 ; CHECK-NEXT: sldi r0, r0, 3 -; CHECK-NEXT: sldi r12, r10, 3 -; CHECK-NEXT: add r3, r30, r7 -; CHECK-NEXT: add r5, r29, r7 -; CHECK-NEXT: add r7, r28, r7 -; CHECK-NEXT: add r8, r30, r11 -; CHECK-NEXT: add r9, r29, r11 -; CHECK-NEXT: add r11, r28, r11 -; CHECK-NEXT: add r30, r30, r0 -; CHECK-NEXT: li r0, 0 -; CHECK-NEXT: add r10, r28, r12 -; CHECK-NEXT: add r12, r29, r12 +; CHECK-NEXT: sldi r11, r7, 3 +; CHECK-NEXT: add r30, r5, r0 +; CHECK-NEXT: add r29, r4, r0 +; CHECK-NEXT: add r28, r3, r0 +; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r7, r5, r11 +; CHECK-NEXT: add r9, r4, r11 +; CHECK-NEXT: add r11, r3, r11 +; CHECK-NEXT: add r10, r10, r0 +; CHECK-NEXT: sub r12, r10, r12 +; CHECK-NEXT: sldi r10, r10, 3 +; CHECK-NEXT: sldi r12, r12, 3 +; CHECK-NEXT: add r5, r5, r10 +; CHECK-NEXT: li r10, 0 +; CHECK-NEXT: add r3, r3, r12 +; CHECK-NEXT: add r4, r4, r12 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_6: # %for.body.epil ; CHECK-NEXT: # -; CHECK-NEXT: lfdux f0, r12, r4 -; CHECK-NEXT: lfdux f1, r10, r4 +; CHECK-NEXT: lfdux f0, r4, r8 +; CHECK-NEXT: lfdux f1, r3, r8 ; CHECK-NEXT: addi r6, r6, -1 ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: xsmuldp f0, f1, f0 -; CHECK-NEXT: lfd f1, 0(r30) +; CHECK-NEXT: lfd f1, 0(r5) ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfd f0, 0(r30) -; CHECK-NEXT: add r30, r30, r4 -; CHECK-NEXT: lfdx f0, r11, r0 -; CHECK-NEXT: lfdx f1, r9, r0 +; CHECK-NEXT: stfd f0, 0(r5) +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: lfdx f0, r28, r10 +; CHECK-NEXT: lfdx f1, r29, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r8, r0 +; CHECK-NEXT: lfdx f1, r30, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r8, r0 -; CHECK-NEXT: lfdx f0, r7, r0 -; CHECK-NEXT: lfdx f1, r5, r0 +; CHECK-NEXT: stfdx f0, r30, r10 +; CHECK-NEXT: lfdx f0, r11, r10 +; CHECK-NEXT: lfdx f1, r9, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r3, r0 +; CHECK-NEXT: lfdx f1, r7, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r3, r0 -; CHECK-NEXT: add r0, r0, r4 +; CHECK-NEXT: stfdx f0, r7, r10 +; CHECK-NEXT: add r10, r10, r8 ; CHECK-NEXT: bne cr0, .LBB7_6 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup -; CHECK-NEXT: ld r2, 184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r31, 328(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r30, 320(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r29, 312(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: ld 
r28, 304(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r27, 296(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r26, 288(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r25, 280(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r24, 272(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r23, 264(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r22, 256(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r21, 248(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r20, 240(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r19, 232(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r18, 224(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r17, 216(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r16, 208(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r15, 200(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r14, 192(r1) # 8-byte Folded Reload -; CHECK-NEXT: addi r1, r1, 336 +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload ; CHECK-NEXT: blr entry: %cmp49 = icmp sgt i64 %m, 0