diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -169,7 +169,7 @@ private: PPCTargetMachine *TM = nullptr; - const PPCSubtarget *ST; + const PPCSubtarget *ST; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; @@ -184,10 +184,13 @@ bool runOnLoop(Loop *L); /// Check if required PHI node is already exist in Loop \p L. - bool alreadyPrepared(Loop *L, Instruction* MemI, + bool alreadyPrepared(Loop *L, Instruction *MemI, const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV, - InstrForm Form); + const SCEV *BasePtrIncSCEV, InstrForm Form); + + /// Get the value which defines the increment SCEV \p BasePtrIncSCEV. + Value *getPreparedIncNode(Loop *L, Instruction *MemI, + const SCEV *BasePtrIncSCEV); /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. @@ -266,7 +269,7 @@ if (I->hasName()) return (I->getName() + Suffix).str(); else - return ""; + return ""; } static Value *GetPointerOperand(Value *MemI) { @@ -404,13 +407,13 @@ // contains following load/stores with different remainders: // 1: 10 load/store whose remainder is 1; // 2: 9 load/store whose remainder is 2; - // 3: 1 for remainder 3 and 0 for remainder 0; + // 3: 1 for remainder 3 and 0 for remainder 0; // Now we will choose the first load/store whose remainder is 1 as base and // adjust all other load/stores according to new base, so we will get 10 DS // form and 10 X form. // But we should be more clever, for this case we could use two bases, one for - // remainder 1 and the other for remainder 2, thus we could get 19 DS form and 1 - // X form. + // remainder 1 and the other for remainder 2, thus we could get 19 DS form and + // 1 X form. unsigned MaxCountRemainder = 0; for (unsigned j = 0; j < (unsigned)Form; j++) if ((RemainderOffsetInfo.find(j) != RemainderOffsetInfo.end()) && @@ -515,28 +518,48 @@ if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L)) return MadeChange; - const SCEVConstant *BasePtrIncSCEV = - dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) + bool IsConstantInc = false; + const SCEV *BasePtrIncSCEV = BasePtrSCEV->getStepRecurrence(*SE); + Value *IncNode = getPreparedIncNode(L, MemI, BasePtrIncSCEV); + + const SCEVConstant *BasePtrIncConstantSCEV = + dyn_cast(BasePtrIncSCEV); + if (BasePtrIncConstantSCEV) + IsConstantInc = true; + + // No valid representation for the increment. + if (!IncNode) { + LLVM_DEBUG(dbgs() << "Loop Increasement can not be represented!\n"); return MadeChange; + } + + // Now we only handle update form for constant increment. + // FIXME: add support for non-constant increment UpdateForm. + if (!IsConstantInc && Form == UpdateForm) { + LLVM_DEBUG(dbgs() << "not a constant incresement for update form!\n"); + return MadeChange; + } // For some DS form load/store instructions, it can also be an update form, // if the stride is a multipler of 4. Use update form if prefer it. - bool CanPreInc = (Form == UpdateForm || - ((Form == DSForm) && !BasePtrIncSCEV->getAPInt().urem(4) && - PreferUpdateForm)); + bool CanPreInc = + (Form == UpdateForm || + ((Form == DSForm) && IsConstantInc && + !BasePtrIncConstantSCEV->getAPInt().urem(4) && PreferUpdateForm)); const SCEV *BasePtrStartSCEV = nullptr; if (CanPreInc) BasePtrStartSCEV = - SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncSCEV); + SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncConstantSCEV); else BasePtrStartSCEV = BasePtrSCEV->getStart(); if (!isSafeToExpand(BasePtrStartSCEV, *SE)) return MadeChange; - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) { + LLVM_DEBUG(dbgs() << "Instruction form is already prepared!\n"); return MadeChange; + } LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); @@ -565,9 +588,11 @@ Instruction *PtrInc = nullptr; Instruction *NewBasePtr = nullptr; if (CanPreInc) { + assert(BasePtrIncConstantSCEV && + "update form now only supports constant increment."); Instruction *InsPoint = &*Header->getFirstInsertionPt(); PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + I8Ty, NewPHI, BasePtrIncConstantSCEV->getValue(), getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); for (auto PI : predecessors(Header)) { @@ -594,9 +619,8 @@ BasicBlock *BB = PI; Instruction *InsPoint = BB->getTerminator(); PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); - + I8Ty, NewPHI, IncNode, getInstrName(MemI, GEPNodeIncNameSuffix), + InsPoint); cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); NewPHI->addIncoming(PtrInc, PI); @@ -673,7 +697,7 @@ MadeChange = true; - SuccPrepCount++; + SuccPrepCount++; if (Form == DSForm && !CanPreInc) DSFormChainRewritten++; @@ -726,14 +750,98 @@ return MadeChange; } +Value *PPCLoopInstrFormPrep::getPreparedIncNode(Loop *L, Instruction *MemI, + const SCEV *BasePtrIncSCEV) { + if (isa(BasePtrIncSCEV)) + return cast(BasePtrIncSCEV)->getValue(); + + if (!SE->isLoopInvariant(BasePtrIncSCEV, L)) + return nullptr; + + BasicBlock *BB = MemI->getParent(); + if (!BB) + return nullptr; + + BasicBlock *PredBB = L->getLoopPredecessor(); + BasicBlock *LatchBB = L->getLoopLatch(); + + if (!PredBB || !LatchBB) + return nullptr; + + auto getExistingNode = [&](Instruction *I) -> Value * { + Value *StrippedBasePtr = I; + while (BitCastInst *BC = dyn_cast(StrippedBasePtr)) { + // We only check bitcast instruction with only 1 user here for compiling + // time considering. + if (BC->hasOneUser()) + StrippedBasePtr = *BC->users().begin(); + else + break; + } + + Instruction *StrippedI = dyn_cast(StrippedBasePtr); + if (!StrippedI) + return nullptr; + + // LSR pass may add a getelementptr instruction to do the loop increment, + // also search in that getelementptr instruction. + if (StrippedI->getOpcode() == Instruction::Add || + (StrippedI->getOpcode() == Instruction::GetElementPtr && + StrippedI->getNumOperands() == 2)) { + if (SE->getSCEVAtScope(StrippedI->getOperand(0), L) == BasePtrIncSCEV) + return StrippedI->getOperand(0); + if (SE->getSCEVAtScope(StrippedI->getOperand(1), L) == BasePtrIncSCEV) + return StrippedI->getOperand(1); + return nullptr; + } + return nullptr; + }; + + // Run through the PHIs and check their add users to find valid representation + // for the increment SCEV. + iterator_range PHIIter = BB->phis(); + for (auto &CurrentPHI : PHIIter) { + PHINode *CurrentPHINode = dyn_cast(&CurrentPHI); + if (!CurrentPHINode) + continue; + + if (!SE->isSCEVable(CurrentPHINode->getType())) + continue; + + const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); + + const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast(PHISCEV); + if (!PHIBasePtrSCEV) + continue; + + const SCEV *PHIBasePtrIncSCEV = PHIBasePtrSCEV->getStepRecurrence(*SE); + if (!PHIBasePtrIncSCEV) + continue; + + if (CurrentPHINode->getNumIncomingValues() == 2) { + if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB)) { + if (PHIBasePtrIncSCEV == BasePtrIncSCEV) + for (User *User : CurrentPHINode->users()) + if (Instruction *I = dyn_cast(User)) + if (Value *IncNode = getExistingNode(I)) + return IncNode; + } + } + } + return nullptr; +} + // In order to prepare for the preferred instruction form, a PHI is added. // This function will check to see if that PHI already exists and will return // true if it found an existing PHI with the matched start and increment as the // one we wanted to create. -bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV, - InstrForm Form) { +bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI, + const SCEV *BasePtrStartSCEV, + const SCEV *BasePtrIncSCEV, + InstrForm Form) { BasicBlock *BB = MemI->getParent(); if (!BB) return false; @@ -777,7 +885,7 @@ PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) { ++PHINodeAlreadyExistsUpdate; return true; - } + } if (Form == DSForm || Form == DQForm) { const SCEVConstant *Diff = dyn_cast( SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV)); @@ -788,7 +896,7 @@ ++PHINodeAlreadyExistsDQ; return true; } - } + } } } } diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-prep-non-const-increasement.ll @@ -2,9 +2,6 @@ ; RUN: llc -disable-lsr -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s -; FIXME: PPCLoopInstrFormPrep should be able to common base for "(unsigned long long *)(p + j + 5)" -; and "(unsigned long long *)(p + j + 9)", thus we only have two DS form load inside the loop. - ; long long foo(char *p, int n, int count) { ; int j = 0; ; long long sum = 0; @@ -22,29 +19,24 @@ ; CHECK-NEXT: cmpwi r4, 1 ; CHECK-NEXT: blt cr0, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: addi r6, r3, 5 +; CHECK-NEXT: clrldi r3, r4, 32 ; CHECK-NEXT: extsw r5, r5 -; CHECK-NEXT: li r6, 0 -; CHECK-NEXT: li r7, 5 -; CHECK-NEXT: mtctr r4 -; CHECK-NEXT: li r8, 9 -; CHECK-NEXT: li r4, 0 +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: add r9, r3, r6 +; CHECK-NEXT: ld r4, 0(r6) +; CHECK-NEXT: add r3, r4, r3 +; CHECK-NEXT: ld r4, 4(r6) ; CHECK-NEXT: add r6, r6, r5 -; CHECK-NEXT: ldx r10, r9, r7 -; CHECK-NEXT: ldx r9, r9, r8 -; CHECK-NEXT: add r4, r10, r4 -; CHECK-NEXT: add r4, r4, r9 +; CHECK-NEXT: add r3, r3, r4 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup -; CHECK-NEXT: mr r3, r4 ; CHECK-NEXT: blr ; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %cmp16 = icmp sgt i32 %n, 0 diff --git a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll --- a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll +++ b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll @@ -6,6 +6,10 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: cmpd 5, 7 +; CHECK-NEXT: std 19, -104(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, -96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, -88(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, -80(1) # 8-byte Folded Spill ; CHECK-NEXT: std 23, -72(1) # 8-byte Folded Spill ; CHECK-NEXT: std 24, -64(1) # 8-byte Folded Spill ; CHECK-NEXT: std 25, -56(1) # 8-byte Folded Spill @@ -17,89 +21,95 @@ ; CHECK-NEXT: bge 0, .LBB0_6 ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: addi 30, 5, 1 -; CHECK-NEXT: addi 29, 5, 3 -; CHECK-NEXT: addi 28, 5, 2 +; CHECK-NEXT: addi 28, 5, 3 +; CHECK-NEXT: addi 27, 5, 2 ; CHECK-NEXT: mulld 12, 8, 5 -; CHECK-NEXT: addi 3, 3, 16 +; CHECK-NEXT: addi 29, 3, 16 ; CHECK-NEXT: mulld 0, 9, 8 -; CHECK-NEXT: sldi 11, 10, 3 +; CHECK-NEXT: mr 25, 12 ; CHECK-NEXT: mulld 30, 8, 30 -; CHECK-NEXT: mulld 29, 8, 29 -; CHECK-NEXT: mulld 8, 8, 28 +; CHECK-NEXT: mulld 28, 8, 28 +; CHECK-NEXT: mulld 8, 8, 27 +; CHECK-NEXT: sldi 11, 10, 3 +; CHECK-NEXT: li 27, 0 +; CHECK-NEXT: mr 26, 30 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: add 5, 5, 9 -; CHECK-NEXT: add 12, 12, 0 -; CHECK-NEXT: add 30, 30, 0 -; CHECK-NEXT: add 29, 29, 0 +; CHECK-NEXT: add 25, 25, 0 +; CHECK-NEXT: add 26, 26, 0 +; CHECK-NEXT: add 28, 28, 0 ; CHECK-NEXT: add 8, 8, 0 +; CHECK-NEXT: addi 27, 27, 1 ; CHECK-NEXT: cmpd 5, 7 ; CHECK-NEXT: bge 0, .LBB0_6 ; CHECK-NEXT: .LBB0_3: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_5 Depth 2 -; CHECK-NEXT: sub 28, 5, 10 -; CHECK-NEXT: cmpd 6, 28 +; CHECK-NEXT: sub 24, 5, 10 +; CHECK-NEXT: cmpd 6, 24 ; CHECK-NEXT: bge 0, .LBB0_2 ; CHECK-NEXT: # %bb.4: -; CHECK-NEXT: add 26, 6, 12 -; CHECK-NEXT: add 25, 6, 30 -; CHECK-NEXT: add 24, 6, 29 -; CHECK-NEXT: add 23, 6, 8 -; CHECK-NEXT: sldi 27, 6, 3 -; CHECK-NEXT: sldi 26, 26, 3 -; CHECK-NEXT: sldi 25, 25, 3 -; CHECK-NEXT: sldi 24, 24, 3 -; CHECK-NEXT: sldi 23, 23, 3 -; CHECK-NEXT: add 27, 4, 27 -; CHECK-NEXT: add 26, 3, 26 -; CHECK-NEXT: add 25, 3, 25 -; CHECK-NEXT: add 24, 3, 24 -; CHECK-NEXT: add 23, 3, 23 +; CHECK-NEXT: maddld 19, 0, 27, 30 +; CHECK-NEXT: maddld 20, 0, 27, 12 +; CHECK-NEXT: add 22, 6, 28 +; CHECK-NEXT: add 21, 6, 8 +; CHECK-NEXT: add 20, 6, 20 +; CHECK-NEXT: add 19, 6, 19 +; CHECK-NEXT: sldi 23, 6, 3 +; CHECK-NEXT: sldi 22, 22, 3 +; CHECK-NEXT: sldi 21, 21, 3 +; CHECK-NEXT: add 23, 4, 23 +; CHECK-NEXT: add 22, 29, 22 +; CHECK-NEXT: add 21, 29, 21 +; CHECK-NEXT: sldi 20, 20, 3 +; CHECK-NEXT: sldi 19, 19, 3 +; CHECK-NEXT: add 20, 3, 20 +; CHECK-NEXT: add 19, 3, 19 ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_5: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lfd 0, 0(27) -; CHECK-NEXT: lfd 1, -16(26) +; CHECK-NEXT: lfd 0, 0(23) +; CHECK-NEXT: lfd 1, 0(20) ; CHECK-NEXT: add 6, 6, 10 -; CHECK-NEXT: cmpd 6, 28 +; CHECK-NEXT: cmpd 6, 24 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(26) +; CHECK-NEXT: lfd 1, 8(20) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(26) +; CHECK-NEXT: lfd 1, 16(20) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(26) -; CHECK-NEXT: add 26, 26, 11 +; CHECK-NEXT: lfd 1, 24(20) +; CHECK-NEXT: add 20, 20, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(25) +; CHECK-NEXT: lfd 1, 0(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(25) +; CHECK-NEXT: lfd 1, 8(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(25) +; CHECK-NEXT: lfd 1, 16(19) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(25) -; CHECK-NEXT: add 25, 25, 11 +; CHECK-NEXT: lfd 1, 24(19) +; CHECK-NEXT: add 19, 19, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(23) +; CHECK-NEXT: lfd 1, -16(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(23) +; CHECK-NEXT: lfd 1, -8(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(23) +; CHECK-NEXT: lfd 1, 0(21) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(23) -; CHECK-NEXT: add 23, 23, 11 +; CHECK-NEXT: lfd 1, 8(21) +; CHECK-NEXT: add 21, 21, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -16(24) +; CHECK-NEXT: lfd 1, -16(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, -8(24) +; CHECK-NEXT: lfd 1, -8(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 0(24) +; CHECK-NEXT: lfd 1, 0(22) ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: lfd 1, 8(24) -; CHECK-NEXT: add 24, 24, 11 +; CHECK-NEXT: lfd 1, 8(22) +; CHECK-NEXT: add 22, 22, 11 ; CHECK-NEXT: xsadddp 0, 0, 1 -; CHECK-NEXT: stfd 0, 0(27) -; CHECK-NEXT: add 27, 27, 11 +; CHECK-NEXT: stfd 0, 0(23) +; CHECK-NEXT: add 23, 23, 11 ; CHECK-NEXT: blt 0, .LBB0_5 ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_6: @@ -111,6 +121,10 @@ ; CHECK-NEXT: ld 25, -56(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 24, -64(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 23, -72(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, -80(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, -88(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, -96(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, -104(1) # 8-byte Folded Reload ; CHECK-NEXT: blr %9 = icmp slt i64 %2, %4 br i1 %9, label %10, label %97