diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -186,8 +186,11 @@ /// Check if required PHI node is already exist in Loop \p L. bool alreadyPrepared(Loop *L, Instruction *MemI, const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV, - InstrForm Form); + const SCEV *BasePtrIncSCEV, InstrForm Form); + + /// Get the value which defines the increment SCEV \p BasePtrIncSCEV. + Value *getNodeForInc(Loop *L, Instruction *MemI, + const SCEV *BasePtrIncSCEV); /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. @@ -514,28 +517,49 @@ if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L)) return MadeChange; - const SCEVConstant *BasePtrIncSCEV = - dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) + bool IsConstantInc = false; + const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE); + Value *IncNode = getNodeForInc(L, MemI, BasePtrIncSCEV); + + const SCEVConstant *BasePtrIncConstantSCEV = + dyn_cast(BasePtrIncSCEV); + if (BasePtrIncConstantSCEV) + IsConstantInc = true; + + // No valid representation for the increment. + if (!IncNode) { + LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n"); return MadeChange; + } + + // Now we only handle update form for constant increment. + // FIXME: add support for non-constant increment UpdateForm. + if (!IsConstantInc && Form == UpdateForm) { + LLVM_DEBUG(dbgs() << "not a constant increment for update form!\n"); + return MadeChange; + } // For some DS form load/store instructions, it can also be an update form, - // if the stride is a multipler of 4. Use update form if prefer it. - bool CanPreInc = (Form == UpdateForm || - ((Form == DSForm) && !BasePtrIncSCEV->getAPInt().urem(4) && - PreferUpdateForm)); + // if the stride is constant and is a multipler of 4. Use update form if + // prefer it. + bool CanPreInc = + (Form == UpdateForm || + ((Form == DSForm) && IsConstantInc && + !BasePtrIncConstantSCEV->getAPInt().urem(4) && PreferUpdateForm)); const SCEV *BasePtrStartSCEV = nullptr; if (CanPreInc) BasePtrStartSCEV = - SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncSCEV); + SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncConstantSCEV); else BasePtrStartSCEV = BasePtrSCEV->getStart(); if (!isSafeToExpand(BasePtrStartSCEV, *SE)) return MadeChange; - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) { + LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n"); return MadeChange; + } LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); @@ -564,9 +588,11 @@ Instruction *PtrInc = nullptr; Instruction *NewBasePtr = nullptr; if (CanPreInc) { + assert(BasePtrIncConstantSCEV && + "update form now only supports constant increment."); Instruction *InsPoint = &*Header->getFirstInsertionPt(); PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + I8Ty, NewPHI, BasePtrIncConstantSCEV->getValue(), getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); for (auto PI : predecessors(Header)) { @@ -593,9 +619,8 @@ BasicBlock *BB = PI; Instruction *InsPoint = BB->getTerminator(); PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); - + I8Ty, NewPHI, IncNode, getInstrName(MemI, GEPNodeIncNameSuffix), + InsPoint); cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); NewPHI->addIncoming(PtrInc, PI); @@ -725,14 +750,90 @@ return MadeChange; } +// Find the loop invariant increment node for SCEV BasePtrIncSCEV. +// bb.loop.preheader: +// %start = ... +// bb.loop.body: +// %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ] +// ... +// %add = add %phinode, %inc ; %inc is what we want to get. +// +Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI, + const SCEV *BasePtrIncSCEV) { + // If the increment is a constant, no definition is needed. + // Return the value directly. + if (isa(BasePtrIncSCEV)) + return cast(BasePtrIncSCEV)->getValue(); + + if (!SE->isLoopInvariant(BasePtrIncSCEV, L)) + return nullptr; + + BasicBlock *BB = MemI->getParent(); + if (!BB) + return nullptr; + + BasicBlock *LatchBB = L->getLoopLatch(); + + if (!LatchBB) + return nullptr; + + // Run through the PHIs and check their operands to find valid representation + // for the increment SCEV. + iterator_range PHIIter = BB->phis(); + for (auto &CurrentPHI : PHIIter) { + PHINode *CurrentPHINode = dyn_cast(&CurrentPHI); + if (!CurrentPHINode) + continue; + + if (!SE->isSCEVable(CurrentPHINode->getType())) + continue; + + const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); + + const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast(PHISCEV); + if (!PHIBasePtrSCEV) + continue; + + const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE); + + if (!PHIBasePtrIncSCEV || (PHIBasePtrIncSCEV != BasePtrIncSCEV)) + continue; + + // Get the incoming value from the loop latch and check if the value has + // the add form with the required increment. + if (Instruction *I = dyn_cast( + CurrentPHINode->getIncomingValueForBlock(LatchBB))) { + Value *StrippedBaseI = I; + while (BitCastInst *BC = dyn_cast(StrippedBaseI)) + StrippedBaseI = BC->getOperand(0); + + Instruction *StrippedI = dyn_cast(StrippedBaseI); + if (!StrippedI) + continue; + + // LSR pass may add a getelementptr instruction to do the loop increment, + // also search in that getelementptr instruction. + if (StrippedI->getOpcode() == Instruction::Add || + (StrippedI->getOpcode() == Instruction::GetElementPtr && + StrippedI->getNumOperands() == 2)) { + if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV) + return StrippedI->getOperand(0); + if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV) + return StrippedI->getOperand(1); + } + } + } + return nullptr; +} + // In order to prepare for the preferred instruction form, a PHI is added. // This function will check to see if that PHI already exists and will return // true if it found an existing PHI with the matched start and increment as the // one we wanted to create. bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV, - InstrForm Form) { + const SCEV *BasePtrStartSCEV, + const SCEV *BasePtrIncSCEV, + InstrForm Form) { BasicBlock *BB = MemI->getParent(); if (!BB) return false; diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll @@ -2,9 +2,6 @@ ; RUN: llc -disable-lsr -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s -; FIXME: PPCLoopInstrFormPrep should be able to common base for "(unsigned long long *)(p + j + 5)" -; and "(unsigned long long *)(p + j + 9)", thus we only have two DS form load inside the loop. - ; long long foo(char *p, int n, int count) { ; int j = 0; ; long long sum = 0; @@ -22,31 +19,26 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: addi r4, r4, -1 +; CHECK-NEXT: addi r6, r3, 5 +; CHECK-NEXT: addi r3, r4, -1 ; CHECK-NEXT: extsw r5, r5 -; CHECK-NEXT: li r6, 0 -; CHECK-NEXT: li r7, 5 -; CHECK-NEXT: li r8, 9 -; CHECK-NEXT: clrldi r4, r4, 32 -; CHECK-NEXT: addi r4, r4, 1 -; CHECK-NEXT: mtctr r4 -; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: add r9, r3, r6 +; CHECK-NEXT: ld r4, 0(r6) +; CHECK-NEXT: add r3, r4, r3 +; CHECK-NEXT: ld r4, 4(r6) ; CHECK-NEXT: add r6, r6, r5 -; CHECK-NEXT: ldx r10, r9, r7 -; CHECK-NEXT: ldx r9, r9, r8 -; CHECK-NEXT: add r4, r10, r4 -; CHECK-NEXT: add r4, r4, r9 +; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup -; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %cmp16 = icmp sgt i32 %n, 0 diff --git a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll --- a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll +++ b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll @@ -6,6 +6,10 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: cmpd 5, 7 +; CHECK-NEXT: std 19, -104(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, -96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, -88(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, -80(1) # 8-byte Folded Spill ; CHECK-NEXT: std 23, -72(1) # 8-byte Folded Spill ; CHECK-NEXT: std 24, -64(1) # 8-byte Folded Spill ; CHECK-NEXT: std 25, -56(1) # 8-byte Folded Spill @@ -17,89 +21,95 @@ ; CHECK-NEXT: bge 0, .LBB0_6 ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: addi 30, 5, 1 -; CHECK-NEXT: addi 29, 5, 3 -; CHECK-NEXT: addi 28, 5, 2 +; CHECK-NEXT: addi 28, 5, 3 +; CHECK-NEXT: addi 27, 5, 2 ; CHECK-NEXT: mulld 12, 8, 5 -; CHECK-NEXT: addi 3, 3, 16 +; CHECK-NEXT: addi 29, 3, 16 ; CHECK-NEXT: mulld 0, 9, 8 -; CHECK-NEXT: sldi 11, 10, 3 +; CHECK-NEXT: mr 25, 12 ; CHECK-NEXT: mulld 30, 8, 30 -; CHECK-NEXT: mulld 29, 8, 29 -; CHECK-NEXT: mulld 8, 8, 28 +; CHECK-NEXT: mulld 28, 8, 28 +; CHECK-NEXT: mulld 8, 8, 27 +; CHECK-NEXT: sldi 11, 10, 3 +; CHECK-NEXT: li 27, 0 +; CHECK-NEXT: mr 26, 30 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: add 5, 5, 9 -; CHECK-NEXT: add 12, 12, 0 -; CHECK-NEXT: add 30, 30, 0 -; CHECK-NEXT: add 29, 29, 0 +; CHECK-NEXT: add 25, 25, 0 +; CHECK-NEXT: add 26, 26, 0 +; CHECK-NEXT: add 28, 28, 0 ; CHECK-NEXT: add 8, 8, 0 +; CHECK-NEXT: addi 27, 27, 1 ; CHECK-NEXT: cmpd 5, 7 ; CHECK-NEXT: bge 0, .LBB0_6 ; CHECK-NEXT: .LBB0_3: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_5 Depth 2 -; CHECK-NEXT: sub 28, 5, 10 -; CHECK-NEXT: cmpd 6, 28 +; CHECK-NEXT: sub 24, 5, 10 +; CHECK-NEXT: cmpd 6, 24 ; CHECK-NEXT: bge 0, .LBB0_2 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: add 26, 6, 12 -; CHECK-NEXT: add 25, 6, 30 -; CHECK-NEXT: add 24, 6, 29 -; CHECK-NEXT: add 23, 6, 8 -; CHECK-NEXT: sldi 27, 6, 3 -; CHECK-NEXT: sldi 26, 26, 3 -; CHECK-NEXT: sldi 25, 25, 3 -; CHECK-NEXT: sldi 24, 24, 3 -; CHECK-NEXT: sldi 23, 23, 3 -; CHECK-NEXT: add 27, 4, 27 -; CHECK-NEXT: add 26, 3, 26 -; CHECK-NEXT: add 25, 3, 25 -; CHECK-NEXT: add 24, 3, 24 -; CHECK-NEXT: add 23, 3, 23 +; CHECK-NEXT: maddld 19, 0, 27, 30 +; CHECK-NEXT: maddld 20, 0, 27, 12 +; CHECK-NEXT: add 22, 6, 28 +; CHECK-NEXT: add 21, 6, 8 +; CHECK-NEXT: add 20, 6, 20 +; CHECK-NEXT: add 19, 6, 19 +; CHECK-NEXT: sldi 23, 6, 3 +; CHECK-NEXT: sldi 22, 22, 3 +; CHECK-NEXT: sldi 21, 21, 3 +; CHECK-NEXT: add 23, 4, 23 +; CHECK-NEXT: add 22, 29, 22 +; CHECK-NEXT: add 21, 29, 21 +; CHECK-NEXT: sldi 20, 20, 3 +; CHECK-NEXT: sldi 19, 19, 3 +; CHECK-NEXT: add 20, 3, 20 +; CHECK-NEXT: add 19, 3, 19 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_5: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lfd 0, 0(27) -; CHECK-NEXT: lfd 1, -16(26) +; CHECK-NEXT: lfd 0, 0(23) +; CHECK-NEXT: lfd 1, 0(20) ; CHECK-NEXT: add 6, 6, 10 -; CHECK-NEXT: cmpd 6, 28 +; CHECK-NEXT: cmpd 6, 24 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(26) +; CHECK-NEXT: lfd 1, 8(20) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(26) +; CHECK-NEXT: lfd 1, 16(20) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(26) -; CHECK-NEXT: add 26, 26, 11 +; CHECK-NEXT: lfd 1, 24(20) +; CHECK-NEXT: add 20, 20, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(25) +; CHECK-NEXT: lfd 1, 0(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(25) +; CHECK-NEXT: lfd 1, 8(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(25) +; CHECK-NEXT: lfd 1, 16(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(25) -; CHECK-NEXT: add 25, 25, 11 +; CHECK-NEXT: lfd 1, 24(19) +; CHECK-NEXT: add 19, 19, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(23) +; CHECK-NEXT: lfd 1, -16(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(23) +; CHECK-NEXT: lfd 1, -8(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(23) +; CHECK-NEXT: lfd 1, 0(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(23) -; CHECK-NEXT: add 23, 23, 11 +; CHECK-NEXT: lfd 1, 8(21) +; CHECK-NEXT: add 21, 21, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(24) +; CHECK-NEXT: lfd 1, -16(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(24) +; CHECK-NEXT: lfd 1, -8(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(24) +; CHECK-NEXT: lfd 1, 0(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(24) -; CHECK-NEXT: add 24, 24, 11 +; CHECK-NEXT: lfd 1, 8(22) +; CHECK-NEXT: add 22, 22, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: stfd 0, 0(27) -; CHECK-NEXT: add 27, 27, 11 +; CHECK-NEXT: stfd 0, 0(23) +; CHECK-NEXT: add 23, 23, 11 ; CHECK-NEXT: blt 0, .LBB0_5 ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_6: @@ -111,6 +121,10 @@ ; CHECK-NEXT: ld 25, -56(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 24, -64(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 23, -72(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, -80(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, -88(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, -96(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, -104(1) # 8-byte Folded Reload ; CHECK-NEXT: blr %9 = icmp slt i64 %2, %4 br i1 %9, label %10, label %97