Index: llvm/lib/Target/PowerPC/CMakeLists.txt =================================================================== --- llvm/lib/Target/PowerPC/CMakeLists.txt +++ llvm/lib/Target/PowerPC/CMakeLists.txt @@ -29,7 +29,7 @@ PPCEarlyReturn.cpp PPCFastISel.cpp PPCFrameLowering.cpp - PPCLoopPreIncPrep.cpp + PPCLoopInstrFormPrep.cpp PPCMCInstLower.cpp PPCMachineFunctionInfo.cpp PPCMachineScheduler.cpp Index: llvm/lib/Target/PowerPC/PPC.h =================================================================== --- llvm/lib/Target/PowerPC/PPC.h +++ llvm/lib/Target/PowerPC/PPC.h @@ -33,7 +33,7 @@ #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif - FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM); + FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM); FunctionPass *createPPCTOCRegDepsPass(); FunctionPass *createPPCEarlyReturnPass(); FunctionPass *createPPCVSXCopyPass(); @@ -59,7 +59,7 @@ #ifndef NDEBUG void initializePPCCTRLoopsVerifyPass(PassRegistry&); #endif - void initializePPCLoopPreIncPrepPass(PassRegistry&); + void initializePPCLoopInstrFormPrepPass(PassRegistry&); void initializePPCTOCRegDepsPass(PassRegistry&); void initializePPCEarlyReturnPass(PassRegistry&); void initializePPCVSXCopyPass(PassRegistry&); Index: llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -0,0 +1,855 @@ +//===------ PPCLoopInstrFormPrep.cpp - Loop Instr form Prep. Pass ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to prepare loops for ppc preferred instruction +// forms. Like DS form, DQ form, D/DS form with update. 
+// Additional PHIs are created for loop induction variables used by load/store +// instructions so that the preferred form can be used. +// +// 1: For DS/DQ form preparation, transform a reasonable number of load/store to +// DS/DQ form which means displacement of these load/stores must meet +// instruction encoding requirement. That is displacement for DS form must be +// a multiple of 4 and 16 for DQ form. +// Generically, this means transforming loops like this: +// for (int i = 0; i < n; ++i) { +// unsigned long x1 = *(unsigned long *)(p + i + 5); +// unsigned long x2 = *(unsigned long *)(p + i + 9); +// } +// +// to look like this: +// +// unsigned NewP = p + 5; +// for (int i = 0; i < n; ++i) { +// unsigned long x1 = *(unsigned long *)(i + NewP); +// unsigned long x2 = *(unsigned long *)(i + NewP + 4); +// } +// +// 2: For D/DS form with update preparation, it is to prepare loops to +// pre-increment addressing modes. +// Generically, this means transforming loops like this: +// for (int i = 0; i < n; ++i) +// array[i] = c; +// +// to look like this: +// +// T *p = array[-1]; +// for (int i = 0; i < n; ++i) +// *++p = c; +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ppc-loop-instr-form-prep" + +#include "PPC.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include 
"llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include +#include +#include + +using namespace llvm; + +// By default, we limit this to creating 16 PHIs (which is a little over half +// of the allocatable register set). +static cl::opt + MaxVars("ppc-preinc-prep-max-vars", cl::Hidden, cl::init(16), + cl::desc("Potential PHI threshold for PPC preinc loop prep")); + +static cl::opt MinInstNum( + "ppc-inst-form-prep-min-inst-num", cl::Hidden, cl::init(2), + cl::desc("Min number of same base mem instrs for PPC inst form prep")); + +STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form"); +STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form"); +STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form"); +STATISTIC(NumPrepDS, "DS form preparation number for one function"); +STATISTIC(NumPrepDQ, "DQ form preparation number for one function"); +STATISTIC(NumPrepUpdate, "Update form preparation number for one function"); + +namespace { + +struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} + + const SCEVConstant *Offset; + Instruction *Instr; +}; + +struct Bucket { + Bucket(const SCEV *B, Instruction *I) + : BaseSCEV(B), Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector Elements; +}; + +class PPCLoopInstrFormPrep : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + + PPCLoopInstrFormPrep() : FunctionPass(ID) { + initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry()); + } + + PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + 
initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F) override; + +private: + // "UpdateForm" is not a real PPC instruction form, it stands for dform + // load/store with update like ldu/stdu, or Prefetch intrinsic. + enum { UpdateForm = 1, DSForm, DQForm }; + + PPCTargetMachine *TM = nullptr; + const PPCSubtarget *ST; + DominatorTree *DT; + + LoopInfo *LI; + ScalarEvolution *SE; + bool PreserveLCSSA; + + // Candidate chains for Update form. + SmallVector BucketsUpdate; + // Candidate chains for DS form. + SmallVector BucketsDS; + // Candidate chains for DQ form. + SmallVector BucketsDQ; + + // Need to recollect DS candidates? There are some loads/stores which may + // exist in both BucketsDS and BucketsUpdate, if some load/store is changed + // in one bucket, need to update it in another one. + // To save compiling time, just collect another time for BucketsDS if there + // is any such load/store. 
+ bool recollectBucketsDS; + + bool runOnLoop(Loop *L); + bool alreadyPrepared(Loop *L, Instruction *MemI, const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV, unsigned Form); + bool isUpdateCandidate(const Value *PtrValue, + const SCEVAddRecExpr *LARSCEV) const; + bool isDSCandidate(const Value *PtrValue) const; + bool isDQCandidate(const Value *PtrValue) const; + bool collectCandidates(Loop *L, bool OnlyDS = false); + bool addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector &Buckets); + bool LoopPreIncPrep(Loop *L); + bool DSDQFormPrep(Loop *L, unsigned Form); + bool prepareBaseForUpdateFormChain(Loop *L, Bucket &BucketChain); + bool prepareBaseForDSDQFormChain(Loop *L, Bucket &BucketChain, unsigned Form); + bool rewriteLoadStores(Loop *L, Bucket &BucketChain, + SmallSet &BBChanged, unsigned Form); + unsigned getAllChainsSize(void); +}; + +} // end anonymous namespace + +char PPCLoopInstrFormPrep::ID = 0; +static const char *name = "Prepare loop for ppc preferred instruction forms"; +INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) + +FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) { + return new PPCLoopInstrFormPrep(TM); +} + +static bool IsPtrInBounds(Value *BasePtr) { + Value *StrippedBasePtr = BasePtr; + while (BitCastInst *BC = dyn_cast(StrippedBasePtr)) + StrippedBasePtr = BC->getOperand(0); + if (GetElementPtrInst *GEP = dyn_cast(StrippedBasePtr)) + return GEP->isInBounds(); + + return false; +} + +static Value *GetPointerOperand(Value *MemI) { + if (LoadInst *LMemI = dyn_cast(MemI)) { + return LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(MemI)) { + return SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast(MemI)) { + if (IMemI->getIntrinsicID() == 
Intrinsic::prefetch) + return IMemI->getArgOperand(0); + } + + return nullptr; +} + +bool PPCLoopInstrFormPrep::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + LI = &getAnalysis().getLoopInfo(); + SE = &getAnalysis().getSE(); + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + ST = TM->getSubtargetImpl(F); + + bool MadeChange = false; + + for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) + for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) + MadeChange |= runOnLoop(*L); + + return MadeChange; +} + +unsigned PPCLoopInstrFormPrep::getAllChainsSize(void) { + return BucketsDS.size() + BucketsDQ.size() + BucketsUpdate.size(); +} + +bool PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, + SmallVector &Buckets) { + assert((MemI && GetPointerOperand(MemI)) && + "Candidate should be a memory instruction."); + assert(LSCEV && "Invalid SCEV for Ptr value."); + bool FoundBucket = false; + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; + } + } + + if (!FoundBucket) { + if (getAllChainsSize() == MaxVars) + return false; + Buckets.push_back(Bucket(LSCEV, MemI)); + } + return true; +} + +bool PPCLoopInstrFormPrep::isDSCandidate(const Value *PtrValue) const { + assert(PtrValue && "Invalid parameter!"); + //FIXME: 32 bit instruction lwa is also DS form. 
+ return ((PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) || + (PtrValue->getType()->getPointerElementType()->isFloatTy()) || + (PtrValue->getType()->getPointerElementType()->isDoubleTy())); +} + +bool PPCLoopInstrFormPrep::isDQCandidate(const Value *PtrValue) const { + assert(PtrValue && "Invalid parameter!"); + return ST->hasP9Vector() && + (PtrValue->getType()->getPointerElementType()->isVectorTy()); +} + +bool PPCLoopInstrFormPrep::isUpdateCandidate( + const Value *PtrValue, const SCEVAddRecExpr *LARSCEV) const { + assert((PtrValue && LARSCEV) && "Invalid parameter!"); + if (ST->hasAltivec() && + PtrValue->getType()->getPointerElementType()->isVectorTy()) + return false; + // See getPreIndexedAddressParts, the displacement for LDU/STDU has to + // be 4's multiple (DS-form). For i64 loads/stores when the displacement + // fits in a 16-bit signed field but isn't a multiple of 4, it will be + // useless and possible to break some original well-form addressing mode + // to make this pre-inc prep for it. 
+ if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { + if (const SCEVConstant *StepConst = + dyn_cast(LARSCEV->getStepRecurrence(*SE))) { + const APInt &ConstInt = StepConst->getValue()->getValue(); + if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) + return false; + } + } + return true; +} + +bool PPCLoopInstrFormPrep::collectCandidates(Loop *L, bool OnlyDS) { + + bool HasCandidate = false; + + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; + ++I) { + for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); J != JE; + ++J) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast(J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast(J)) { + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else if (IntrinsicInst *IMemI = dyn_cast(J)) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(0); + } else + continue; + } else + continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); + const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); + if (!LARSCEV || LARSCEV->getLoop() != L) + continue; + + if (!OnlyDS && !isa(J) && isDQCandidate(PtrValue)) + HasCandidate |= addOneCandidate(MemI, LSCEV, BucketsDQ); + else { + if (!OnlyDS && isUpdateCandidate(PtrValue, LARSCEV)) + HasCandidate |= addOneCandidate(MemI, LSCEV, BucketsUpdate); + if (!isa(J) && isDSCandidate(PtrValue)) + HasCandidate |= addOneCandidate(MemI, LSCEV, BucketsDS); + } + } + } + return HasCandidate; +} + +// In order to prepare for the preferred instruction form, a PHI is added. 
+// This function will check to see if that PHI already exists and will return +// true if it found an existing PHI with the matched start and increment as the +// one we wanted to create. +bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI, + const SCEV *BasePtrStartSCEV, + const SCEVConstant *BasePtrIncSCEV, + unsigned Form) { + BasicBlock *BB = MemI->getParent(); + if (!BB) + return false; + + BasicBlock *PredBB = L->getLoopPredecessor(); + BasicBlock *LatchBB = L->getLoopLatch(); + + if (!PredBB || !LatchBB) + return false; + + // Run through the PHIs and see if we have some that looks like a preparation + iterator_range PHIIter = BB->phis(); + for (auto &CurrentPHI : PHIIter) { + PHINode *CurrentPHINode = dyn_cast(&CurrentPHI); + if (!CurrentPHINode) + continue; + + if (!SE->isSCEVable(CurrentPHINode->getType())) + continue; + + const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); + + const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast(PHISCEV); + if (!PHIBasePtrSCEV) + continue; + + const SCEVConstant *PHIBasePtrIncSCEV = + dyn_cast(PHIBasePtrSCEV->getStepRecurrence(*SE)); + if (!PHIBasePtrIncSCEV) + continue; + + if (CurrentPHINode->getNumIncomingValues() == 2) { + if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && + CurrentPHINode->getIncomingBlock(1) == PredBB) || + (CurrentPHINode->getIncomingBlock(1) == LatchBB && + CurrentPHINode->getIncomingBlock(0) == PredBB)) { + if (PHIBasePtrIncSCEV == BasePtrIncSCEV) { + // The existing PHI (CurrentPHINode) has the same start and increment + // as the PHI that we wanted to create. 
+ if (Form == UpdateForm && + PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) { + ++PHINodeAlreadyExistsUpdate; + return true; + } else if (Form == DSForm) { + const SCEVConstant *Diff = dyn_cast( + SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV)); + if (Diff && !Diff->getAPInt().urem(4)) { + ++PHINodeAlreadyExistsDS; + return true; + } + } else if (Form == DQForm) { + const SCEVConstant *Diff = dyn_cast( + SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV)); + if (Diff && !Diff->getAPInt().urem(16)) { + ++PHINodeAlreadyExistsDQ; + return true; + } + } + } + } + } + } + return false; +} + +bool PPCLoopInstrFormPrep::prepareBaseForDSDQFormChain(Loop *L, + Bucket &BucketChain, + unsigned Form) { + unsigned DispConstraint = 0; + if (Form == DSForm) + DispConstraint = 4; + else if (Form == DQForm) + DispConstraint = 16; + else + assert(false && "Invalid form"); + // ReminderOffsetInfo details: + // key: value of (Offset urem DispConstraint). For DSForm, it can be [0, 4). + // first of pair: first BucketElement index with a reminder key. for key 0, + // the value of first must be 0. + // second of pair: number of load/stores with the same reminder. + DenseMap> ReminderOffsetInfo; + + for (unsigned j = 0, je = BucketChain.Elements.size(); j != je; ++j) { + if (!BucketChain.Elements[j].Offset) + ReminderOffsetInfo[0] = std::make_pair(0, 1); + else { + unsigned Reminder = + BucketChain.Elements[j].Offset->getAPInt().urem(DispConstraint); + if (ReminderOffsetInfo.find(Reminder) == ReminderOffsetInfo.end()) + ReminderOffsetInfo[Reminder] = std::make_pair(j, 1); + else + ReminderOffsetInfo[Reminder].second++; + } + } + // Find the most profitable base which should have the max number of + // load/store with same reminder. 
+ unsigned MaxCountReminder = 0; + for (unsigned j = 0; j < DispConstraint; j++) + if ((ReminderOffsetInfo.find(j) != ReminderOffsetInfo.end()) && + ReminderOffsetInfo[j].second > + ReminderOffsetInfo[MaxCountReminder].second) + MaxCountReminder = j; + + if (ReminderOffsetInfo[MaxCountReminder].second < MinInstNum) + return false; + + // If the first value is most profitable, nothing needs to update. + if (MaxCountReminder == 0) + return true; + + // Update load/store according to new base. + const SCEV *Offset = + BucketChain.Elements[ReminderOffsetInfo[MaxCountReminder].first].Offset; + BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); + for (auto &E : BucketChain.Elements) { + if (E.Offset) + E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast(SE->getNegativeSCEV(Offset)); + } + + std::swap(BucketChain.Elements[ReminderOffsetInfo[MaxCountReminder].first], + BucketChain.Elements[0]); + return true; +} + +// Currently we always choose an exist load/store offset. This maybe lead to +// suboptimal code sequences. For example, for one DS chain with offsets +// {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp +// for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a +// multipler of 4, it cannot be represented by sint16. Need to improve this. +bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Loop *L, + Bucket &BucketChain) { + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. 
Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast(BucketChain.Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!BucketChain.Elements[j].Offset || + BucketChain.Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = BucketChain.Elements[j].Offset; + BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); + for (auto &E : BucketChain.Elements) { + if (E.Offset) + E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast(SE->getNegativeSCEV(Offset)); + } + + std::swap(BucketChain.Elements[j], BucketChain.Elements[0]); + break; + } + return true; +} + +bool PPCLoopInstrFormPrep::rewriteLoadStores( + Loop *L, Bucket &BucketChain, SmallSet &BBChanged, + unsigned Form) { + bool MadeChange = false; + const SCEVAddRecExpr *BasePtrSCEV = + cast(BucketChain.BaseSCEV); + if (!BasePtrSCEV->isAffine()) + return MadeChange; + + LLVM_DEBUG(dbgs() << "LIFP: rewrite load/store for form " << Form << "\n"); + LLVM_DEBUG(dbgs() << "LIFP: base: "; BasePtrSCEV->dump()); + + assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); + + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. 
+ Instruction *MemI = BucketChain.Elements.begin()->Instr; + Value *BasePtr = GetPointerOperand(MemI); + assert(BasePtr && "No pointer operand"); + + Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); + Type *I8PtrTy = + Type::getInt8PtrTy(MemI->getParent()->getContext(), + BasePtr->getType()->getPointerAddressSpace()); + + const SCEVConstant *BasePtrIncSCEV = + dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); + if (!BasePtrIncSCEV) + return MadeChange; + const SCEV *BasePtrStartSCEV = nullptr; + if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L)) + return MadeChange; + bool isUpdateFormProfitable = + (Form == UpdateForm || + ((Form == DSForm) && !BasePtrIncSCEV->getAPInt().urem(4))); + if (isUpdateFormProfitable) + BasePtrStartSCEV = + SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncSCEV); + else + BasePtrStartSCEV = BasePtrSCEV->getStart(); + + if (!isSafeToExpand(BasePtrStartSCEV, *SE)) + return MadeChange; + + if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form)) + return MadeChange; + + BasicBlock *Header = L->getHeader(); + unsigned HeaderLoopPredCount = pred_size(Header); + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + + PHINode *NewPHI = + PHINode::Create(I8PtrTy, HeaderLoopPredCount, + MemI->hasName() ? MemI->getName() + ".phi" : "", + Header->getFirstNonPHI()); + + SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); + Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, + LoopPredecessor->getTerminator()); + LLVM_DEBUG(dbgs() << "LIFP: start value of new base: "; BasePtrStart->dump()); + + // Note that LoopPredecessor might occur in the predecessor list multiple + // times, and we need to add it the right number of times. 
+ for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); PI != PE; + ++PI) { + if (*PI != LoopPredecessor) + continue; + + NewPHI->addIncoming(BasePtrStart, LoopPredecessor); + } + + Instruction *PtrInc = nullptr; + Instruction *NewBasePtr; + if (isUpdateFormProfitable) { + Instruction *InsPoint = &*Header->getFirstInsertionPt(); + PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); + cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); + for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); PI != PE; + ++PI) { + if (*PI == LoopPredecessor) + continue; + + NewPHI->addIncoming(PtrInc, *PI); + } + if (PtrInc->getType() != BasePtr->getType()) + NewBasePtr = new BitCastInst( + PtrInc, BasePtr->getType(), + PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); + else + NewBasePtr = PtrInc; + } else { + for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); PI != PE; + ++PI) { + if (*PI == LoopPredecessor) + continue; + + BasicBlock *BB = *PI; + Instruction *InsPoint = BB->getTerminator(); + PtrInc = GetElementPtrInst::Create( + I8Ty, NewPHI, BasePtrIncSCEV->getValue(), + MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); + + cast(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr)); + + NewPHI->addIncoming(PtrInc, *PI); + } + PtrInc = NewPHI; + if (NewPHI->getType() != BasePtr->getType()) + NewBasePtr = + new BitCastInst(NewPHI, BasePtr->getType(), + NewPHI->hasName() ? NewPHI->getName() + ".cast" : "", + &*Header->getFirstInsertionPt()); + else + NewBasePtr = NewPHI; + } + + if (Instruction *IDel = dyn_cast(BasePtr)) + BBChanged.insert(IDel->getParent()); + BasePtr->replaceAllUsesWith(NewBasePtr); + RecursivelyDeleteTriviallyDeadInstructions(BasePtr); + + // If rewrite for UpdateForm and the load/store is also a candidate in + // BucketsDS, set recollectBucketsDS to true as memory access instruction is + // changed. 
+ if (Form == UpdateForm && isDSCandidate(BasePtr)) + recollectBucketsDS = true; + + MadeChange = true; + + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. + SmallPtrSet NewPtrs; + NewPtrs.insert(NewBasePtr); + + for (auto I = std::next(BucketChain.Elements.begin()), + IE = BucketChain.Elements.end(); + I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); + assert(Ptr && "No pointer operand"); + if (NewPtrs.count(Ptr)) + continue; + + Instruction *RealNewPtr; + if (!I->Offset || I->Offset->getValue()->isZero()) { + RealNewPtr = NewBasePtr; + } else { + Instruction *PtrIP = dyn_cast(Ptr); + if (PtrIP && isa(NewBasePtr) && + cast(NewBasePtr)->getParent() == PtrIP->getParent()) + PtrIP = nullptr; + else if (isa(PtrIP)) + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); + else if (!PtrIP) + PtrIP = I->Instr; + + GetElementPtrInst *NewPtr = GetElementPtrInst::Create( + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); + if (!PtrIP) + NewPtr->insertAfter(cast(PtrInc)); + NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); + RealNewPtr = NewPtr; + } + + if (Instruction *IDel = dyn_cast(Ptr)) + BBChanged.insert(IDel->getParent()); + + Instruction *ReplNewPtr; + if (Ptr->getType() != RealNewPtr->getType()) { + ReplNewPtr = + new BitCastInst(RealNewPtr, Ptr->getType(), + Ptr->hasName() ? Ptr->getName() + ".cast" : ""); + ReplNewPtr->insertAfter(RealNewPtr); + } else + ReplNewPtr = RealNewPtr; + + Ptr->replaceAllUsesWith(ReplNewPtr); + RecursivelyDeleteTriviallyDeadInstructions(Ptr); + // If rewrite for UpdateForm and the load/store is also a candidate in + // BucketsDS, set recollectBucketsDS to true as memory access instruction is + // changed. 
+ if (Form == UpdateForm && isDSCandidate(Ptr)) + recollectBucketsDS = true; + + NewPtrs.insert(RealNewPtr); + } + + return MadeChange; +} + +bool PPCLoopInstrFormPrep::LoopPreIncPrep(Loop *L) { + bool MadeChange = false; + if (BucketsUpdate.empty()) + return MadeChange; + SmallSet BBChanged; + bool CurChainChanged = false; + for (unsigned i = 0, e = BucketsUpdate.size(); i != e; ++i) { + // The base address of each bucket is transformed into a phi and the others + // are rewritten based on new base. + if (prepareBaseForUpdateFormChain(L, BucketsUpdate[i])) { + CurChainChanged = + rewriteLoadStores(L, BucketsUpdate[i], BBChanged, UpdateForm); + if (CurChainChanged) + NumPrepUpdate++; + MadeChange |= CurChainChanged; + } + } + if (MadeChange) { + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + if (BBChanged.count(*I)) + DeleteDeadPHIs(*I); + } + } + return MadeChange; +} + +bool PPCLoopInstrFormPrep::DSDQFormPrep(Loop *L, unsigned Form) { + bool MadeChange = false; + + SmallVector *Buckets; + if (Form == DSForm) + Buckets = &BucketsDS; + else if (Form == DQForm) + Buckets = &BucketsDQ; + else + assert(false && "Invalid form."); + + if (Buckets->empty()) + return MadeChange; + + SmallSet BBChanged; + bool CurChainChanged = false; + for (unsigned i = 0, e = Buckets->size(); i != e; ++i) { + if ((*Buckets)[i].Elements.size() < MinInstNum) + continue; + if (prepareBaseForDSDQFormChain(L, (*Buckets)[i], Form)) { + CurChainChanged = rewriteLoadStores(L, (*Buckets)[i], BBChanged, Form); + if (CurChainChanged && Form == DSForm) + NumPrepDS++; + else if (CurChainChanged && Form == DQForm) + NumPrepDQ++; + MadeChange |= CurChainChanged; + } + } + if (MadeChange) { + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + if (BBChanged.count(*I)) + DeleteDeadPHIs(*I); + } + } + return MadeChange; +} + +bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { + bool MadeChange = false; + + // Only prep. 
the inner-most loop + // FIXME: there should be chances for non inner-most loops. + if (!L->empty()) + return MadeChange; + + LLVM_DEBUG(dbgs() << "LIFP for loop: "; L->dump()); + + BasicBlock *LoopPredecessor = L->getLoopPredecessor(); + // If there is no loop predecessor, or the loop predecessor's terminator + // returns a value (which might contribute to determining the loop's + // iteration space), insert a new preheader for the loop. + if (!LoopPredecessor || + !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); + if (LoopPredecessor) + MadeChange = true; + } + if (!LoopPredecessor) { + LLVM_DEBUG(dbgs() << "LIFP fails since no predecessor for current loop.\n"); + return MadeChange; + } + + BucketsUpdate.clear(); + BucketsDS.clear(); + BucketsDQ.clear(); + recollectBucketsDS = false; + + // Collect buckets of comparable addresses used by loads and stores. + bool hasCandidate = collectCandidates(L); + if (!hasCandidate) { + LLVM_DEBUG(dbgs() << "LIFP fails since no candidate found.\n"); + return MadeChange; + } + LLVM_DEBUG(dbgs() << "BucketsUpdate chain size is " << BucketsUpdate.size() + << "\n"); + LLVM_DEBUG(dbgs() << "BucketsDS chain size is " << BucketsDS.size() << "\n"); + LLVM_DEBUG(dbgs() << "BucketsDQ chain size is " << BucketsDQ.size() << "\n"); + + // Prepare for pre inc form. + if (!BucketsUpdate.empty()) + MadeChange |= LoopPreIncPrep(L); + + // Some candidates in BucketsDS is invalidate due to changing in above + // LoopPreIncPrep transform. + if (recollectBucketsDS) { + BucketsDS.clear(); + hasCandidate = collectCandidates(L, true /*OnlyDS*/); + LLVM_DEBUG(dbgs() << "After recollect, BucketsDS chain size is " + << BucketsDS.size() << "\n"); + } + + // Prepare for DS form load/store. + if (!BucketsDS.empty()) + MadeChange |= DSDQFormPrep(L, DSForm); + + // Prepare for DQ form load/store. 
+ if (!BucketsDQ.empty()) + MadeChange |= DSDQFormPrep(L, DQForm); + + return MadeChange; +} Index: llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ /dev/null @@ -1,525 +0,0 @@ -//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass to prepare loops for pre-increment addressing -// modes. Additional PHIs are created for loop induction variables used by -// load/store instructions so that the pre-increment forms can be used. -// Generically, this means transforming loops like this: -// for (int i = 0; i < n; ++i) -// array[i] = c; -// to look like this: -// T *p = array[-1]; -// for (int i = 0; i < n; ++i) -// *++p = c; -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ppc-loop-preinc-prep" - -#include "PPC.h" -#include "PPCSubtarget.h" -#include "PPCTargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" 
-#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include -#include -#include - -using namespace llvm; - -// By default, we limit this to creating 16 PHIs (which is a little over half -// of the allocatable register set). -static cl::opt MaxVars("ppc-preinc-prep-max-vars", - cl::Hidden, cl::init(16), - cl::desc("Potential PHI threshold for PPC preinc loop prep")); - -STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); - -namespace { - - class PPCLoopPreIncPrep : public FunctionPass { - public: - static char ID; // Pass ID, replacement for typeid - - PPCLoopPreIncPrep() : FunctionPass(ID) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); - } - - PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - } - - bool alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV); - bool runOnFunction(Function &F) override; - - bool runOnLoop(Loop *L); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); - - private: - PPCTargetMachine *TM = nullptr; - DominatorTree *DT; - LoopInfo *LI; - ScalarEvolution *SE; - bool PreserveLCSSA; - }; - -} // end anonymous namespace - -char PPCLoopPreIncPrep::ID = 0; -static const char *name = "Prepare loop for pre-inc. 
addressing modes"; -INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) - -FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { - return new PPCLoopPreIncPrep(TM); -} - -namespace { - - struct BucketElement { - BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} - BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - - const SCEVConstant *Offset; - Instruction *Instr; - }; - - struct Bucket { - Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), - Elements(1, BucketElement(I)) {} - - const SCEV *BaseSCEV; - SmallVector Elements; - }; - -} // end anonymous namespace - -static bool IsPtrInBounds(Value *BasePtr) { - Value *StrippedBasePtr = BasePtr; - while (BitCastInst *BC = dyn_cast(StrippedBasePtr)) - StrippedBasePtr = BC->getOperand(0); - if (GetElementPtrInst *GEP = dyn_cast(StrippedBasePtr)) - return GEP->isInBounds(); - - return false; -} - -static Value *GetPointerOperand(Value *MemI) { - if (LoadInst *LMemI = dyn_cast(MemI)) { - return LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(MemI)) { - return SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast(MemI)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) - return IMemI->getArgOperand(0); - } - - return nullptr; -} - -bool PPCLoopPreIncPrep::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); - auto *DTWP = getAnalysisIfAvailable(); - DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - - bool MadeChange = false; - - for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) - for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) - MadeChange |= runOnLoop(*L); - - return MadeChange; -} - -// In order to prepare for the pre-increment a PHI is added. -// This function will check to see if that PHI already exists and will return -// true if it found an existing PHI with the same start and increment as the -// one we wanted to create. -bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV) { - BasicBlock *BB = MemI->getParent(); - if (!BB) - return false; - - BasicBlock *PredBB = L->getLoopPredecessor(); - BasicBlock *LatchBB = L->getLoopLatch(); - - if (!PredBB || !LatchBB) - return false; - - // Run through the PHIs and see if we have some that looks like a preparation - iterator_range PHIIter = BB->phis(); - for (auto & CurrentPHI : PHIIter) { - PHINode *CurrentPHINode = dyn_cast(&CurrentPHI); - if (!CurrentPHINode) - continue; - - if (!SE->isSCEVable(CurrentPHINode->getType())) - continue; - - const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); - - const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast(PHISCEV); - if (!PHIBasePtrSCEV) - continue; - - const SCEVConstant *PHIBasePtrIncSCEV = - dyn_cast(PHIBasePtrSCEV->getStepRecurrence(*SE)); - if (!PHIBasePtrIncSCEV) - continue; - - if (CurrentPHINode->getNumIncomingValues() == 2) { - if ( (CurrentPHINode->getIncomingBlock(0) == LatchBB && - CurrentPHINode->getIncomingBlock(1) == PredBB) || - (CurrentPHINode->getIncomingBlock(1) == LatchBB && - CurrentPHINode->getIncomingBlock(0) == PredBB) ) { - if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && - PHIBasePtrIncSCEV == BasePtrIncSCEV) { - // The existing PHI (CurrentPHINode) has the same start and increment - // as the PHI that we wanted to create. 
- ++PHINodeAlreadyExists; - return true; - } - } - } - } - return false; -} - -bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { - bool MadeChange = false; - - // Only prep. the inner-most loop - if (!L->empty()) - return MadeChange; - - LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); - - BasicBlock *Header = L->getHeader(); - - const PPCSubtarget *ST = - TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr; - - unsigned HeaderLoopPredCount = pred_size(Header); - - // Collect buckets of comparable addresses used by loads and stores. - SmallVector Buckets; - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); - J != JE; ++J) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(J)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(J)) { - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast(J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { - MemI = IMemI; - PtrValue = IMemI->getArgOperand(0); - } else continue; - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (PtrAddrSpace) - continue; - - // There are no update forms for Altivec vector load/stores. - if (ST && ST->hasAltivec() && - PtrValue->getType()->getPointerElementType()->isVectorTy()) - continue; - - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); - if (const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV)) { - if (LARSCEV->getLoop() != L) - continue; - // See getPreIndexedAddressParts, the displacement for LDU/STDU has to - // be 4's multiple (DS-form). 
For i64 loads/stores when the displacement - // fits in a 16-bit signed field but isn't a multiple of 4, it will be - // useless and possible to break some original well-form addressing mode - // to make this pre-inc prep for it. - if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { - if (const SCEVConstant *StepConst = - dyn_cast(LARSCEV->getStepRecurrence(*SE))) { - const APInt &ConstInt = StepConst->getValue()->getValue(); - if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) - continue; - } - } - } else { - continue; - } - - bool FoundBucket = false; - for (auto &B : Buckets) { - const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); - if (const auto *CDiff = dyn_cast(Diff)) { - B.Elements.push_back(BucketElement(CDiff, MemI)); - FoundBucket = true; - break; - } - } - - if (!FoundBucket) { - if (Buckets.size() == MaxVars) - return MadeChange; - Buckets.push_back(Bucket(LSCEV, MemI)); - } - } - } - - if (Buckets.empty()) - return MadeChange; - - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); - // If there is no loop predecessor, or the loop predecessor's terminator - // returns a value (which might contribute to determining the loop's - // iteration space), insert a new preheader for the loop. - if (!LoopPredecessor || - !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); - if (LoopPredecessor) - MadeChange = true; - } - if (!LoopPredecessor) - return MadeChange; - - LLVM_DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n"); - - SmallSet BBChanged; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) { - // The base address of each bucket is transformed into a phi and the others - // are rewritten as offsets of that variable. - - // We have a choice now of which instruction's memory operand we use as the - // base for the generated PHI. 
Always picking the first instruction in each - // bucket does not work well, specifically because that instruction might - // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, - // the choice is somewhat arbitrary, because the backend will happily - // generate direct offsets from both the pre-incremented and - // post-incremented pointer values. Thus, we'll pick the first non-prefetch - // instruction in each bucket, and adjust the recurrence and other offsets - // accordingly. - for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { - if (auto *II = dyn_cast(Buckets[i].Elements[j].Instr)) - if (II->getIntrinsicID() == Intrinsic::prefetch) - continue; - - // If we'd otherwise pick the first element anyway, there's nothing to do. - if (j == 0) - break; - - // If our chosen element has no offset from the base pointer, there's - // nothing to do. - if (!Buckets[i].Elements[j].Offset || - Buckets[i].Elements[j].Offset->isZero()) - break; - - const SCEV *Offset = Buckets[i].Elements[j].Offset; - Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); - for (auto &E : Buckets[i].Elements) { - if (E.Offset) - E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); - else - E.Offset = cast(SE->getNegativeSCEV(Offset)); - } - - std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); - break; - } - - const SCEVAddRecExpr *BasePtrSCEV = - cast(Buckets[i].BaseSCEV); - if (!BasePtrSCEV->isAffine()) - continue; - - LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); - assert(BasePtrSCEV->getLoop() == L && - "AddRec for the wrong loop?"); - - // The instruction corresponding to the Bucket's BaseSCEV must be the first - // in the vector of elements. 
- Instruction *MemI = Buckets[i].Elements.begin()->Instr; - Value *BasePtr = GetPointerOperand(MemI); - assert(BasePtr && "No pointer operand"); - - Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); - Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), - BasePtr->getType()->getPointerAddressSpace()); - - const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); - if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) - continue; - - const SCEVConstant *BasePtrIncSCEV = - dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) - continue; - BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); - if (!isSafeToExpand(BasePtrStartSCEV, *SE)) - continue; - - LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); - - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) - continue; - - PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, - MemI->hasName() ? MemI->getName() + ".phi" : "", - Header->getFirstNonPHI()); - - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); - Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, - LoopPredecessor->getTerminator()); - - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to add it the right number of times. - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI != LoopPredecessor) - continue; - - NewPHI->addIncoming(BasePtrStart, LoopPredecessor); - } - - Instruction *InsPoint = &*Header->getFirstInsertionPt(); - GetElementPtrInst *PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - MemI->hasName() ? 
MemI->getName() + ".inc" : "", InsPoint); - PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); - for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); - PI != PE; ++PI) { - if (*PI == LoopPredecessor) - continue; - - NewPHI->addIncoming(PtrInc, *PI); - } - - Instruction *NewBasePtr; - if (PtrInc->getType() != BasePtr->getType()) - NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), - PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); - else - NewBasePtr = PtrInc; - - if (Instruction *IDel = dyn_cast(BasePtr)) - BBChanged.insert(IDel->getParent()); - BasePtr->replaceAllUsesWith(NewBasePtr); - RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - - // Keep track of the replacement pointer values we've inserted so that we - // don't generate more pointer values than necessary. - SmallPtrSet NewPtrs; - NewPtrs.insert( NewBasePtr); - - for (auto I = std::next(Buckets[i].Elements.begin()), - IE = Buckets[i].Elements.end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->Instr); - assert(Ptr && "No pointer operand"); - if (NewPtrs.count(Ptr)) - continue; - - Instruction *RealNewPtr; - if (!I->Offset || I->Offset->getValue()->isZero()) { - RealNewPtr = NewBasePtr; - } else { - Instruction *PtrIP = dyn_cast(Ptr); - if (PtrIP && isa(NewBasePtr) && - cast(NewBasePtr)->getParent() == PtrIP->getParent()) - PtrIP = nullptr; - else if (isa(PtrIP)) - PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); - else if (!PtrIP) - PtrIP = I->Instr; - - GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, I->Offset->getValue(), - I->Instr->hasName() ? 
I->Instr->getName() + ".off" : "", PtrIP); - if (!PtrIP) - NewPtr->insertAfter(cast(PtrInc)); - NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); - RealNewPtr = NewPtr; - } - - if (Instruction *IDel = dyn_cast(Ptr)) - BBChanged.insert(IDel->getParent()); - - Instruction *ReplNewPtr; - if (Ptr->getType() != RealNewPtr->getType()) { - ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), - Ptr->hasName() ? Ptr->getName() + ".cast" : ""); - ReplNewPtr->insertAfter(RealNewPtr); - } else - ReplNewPtr = RealNewPtr; - - Ptr->replaceAllUsesWith(ReplNewPtr); - RecursivelyDeleteTriviallyDeadInstructions(Ptr); - - NewPtrs.insert(RealNewPtr); - } - - MadeChange = true; - } - - for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); - I != IE; ++I) { - if (BBChanged.count(*I)) - DeleteDeadPHIs(*I); - } - - return MadeChange; -} Index: llvm/lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -104,7 +104,7 @@ #ifndef NDEBUG initializePPCCTRLoopsVerifyPass(PR); #endif - initializePPCLoopPreIncPrepPass(PR); + initializePPCLoopInstrFormPrepPass(PR); initializePPCTOCRegDepsPass(PR); initializePPCEarlyReturnPass(PR); initializePPCVSXCopyPass(PR); @@ -428,7 +428,7 @@ bool PPCPassConfig::addPreISel() { if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None) - addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); + addPass(createPPCLoopInstrFormPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) addPass(createHardwareLoopsPass()); Index: llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -0,0 +1,733 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs 
-mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s + +; test_no_prep: +; unsigned long test_no_prep(char *p, int count) { +; unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4003; +; int DISP4 = 4004; +; for (; i < count ; i++) { +; unsigned long x1 = *(unsigned long *)(p + i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + i + DISP4); +; res += x1*x2*x3*x4; +; } +; return res + count; +; } + +define i64 @test_no_prep(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_no_prep: +; CHECK: addi 3, 3, 4004 +; CHECK: .LBB0_2: # +; CHECK-NEXT: ldx 9, 3, 6 +; CHECK-NEXT: ldx 10, 3, 7 +; CHECK-NEXT: mulld 9, 10, 9 +; CHECK-NEXT: ldx 11, 3, 8 +; CHECK-NEXT: mulld 9, 9, 11 +; CHECK-NEXT: ld 12, 0(3) +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: maddld 5, 9, 12, 5 +; CHECK-NEXT: bdnz .LBB0_2 +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %27, label %5 + +5: ; preds = %2, %5 + %6 = phi i64 [ %25, %5 ], [ 0, %2 ] + %7 = phi i64 [ %24, %5 ], [ 0, %2 ] + %8 = getelementptr inbounds i8, i8* %0, i64 %6 + %9 = getelementptr inbounds i8, i8* %8, i64 4001 + %10 = bitcast i8* %9 to i64* + %11 = load i64, i64* %10, align 8 + %12 = getelementptr inbounds i8, i8* %8, i64 4002 + %13 = bitcast i8* %12 to i64* + %14 = load i64, i64* %13, align 8 + %15 = getelementptr inbounds i8, i8* %8, i64 4003 + %16 = bitcast i8* %15 to i64* + %17 = load i64, i64* %16, align 8 + %18 = getelementptr inbounds i8, i8* %8, i64 4004 + %19 = bitcast i8* %18 to i64* + %20 = load i64, i64* %19, align 8 + %21 = mul i64 %14, %11 + %22 = mul i64 %21, %17 + %23 = mul i64 %22, %20 + %24 = add i64 %23, %7 + %25 = add nuw i64 %6, 1 + %26 = icmp ult i64 %25, %3 + br i1 %26, label %5, label %27 + +27: ; preds = %5, %2 + %28 = phi i64 [ 0, %2 ], [ %24, %5 ] + %29 = add i64 %28, %3 + ret i64 %29 +} + +; test_ds_prep: +; 
unsigned long test_ds_prep(char *p, int count) { +; unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4003; +; int DISP4 = 4006; +; for (; i < count ; i++) { +; unsigned long x1 = *(unsigned long *)(p + i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + i + DISP4); +; res += x1*x2*x3*x4; +; } +; return res + count; +; } + +define i64 @test_ds_prep(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_ds_prep: +; CHECK: addi 6, 3, 4002 +; CHECK: .LBB1_2: # +; CHECK-NEXT: ldx 9, 6, 7 +; CHECK-NEXT: ld 10, 0(6) +; CHECK-NEXT: mulld 9, 10, 9 +; CHECK-NEXT: ldx 11, 6, 5 +; CHECK-NEXT: mulld 9, 9, 11 +; CHECK-NEXT: addi 8, 6, 1 +; CHECK-NEXT: ld 6, 4(6) +; CHECK-NEXT: maddld 3, 9, 6, 3 +; CHECK-NEXT: mr 6, 8 +; CHECK-NEXT: bdnz .LBB1_2 +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %27, label %5 + +5: ; preds = %2, %5 + %6 = phi i64 [ %25, %5 ], [ 0, %2 ] + %7 = phi i64 [ %24, %5 ], [ 0, %2 ] + %8 = getelementptr inbounds i8, i8* %0, i64 %6 + %9 = getelementptr inbounds i8, i8* %8, i64 4001 + %10 = bitcast i8* %9 to i64* + %11 = load i64, i64* %10, align 8 + %12 = getelementptr inbounds i8, i8* %8, i64 4002 + %13 = bitcast i8* %12 to i64* + %14 = load i64, i64* %13, align 8 + %15 = getelementptr inbounds i8, i8* %8, i64 4003 + %16 = bitcast i8* %15 to i64* + %17 = load i64, i64* %16, align 8 + %18 = getelementptr inbounds i8, i8* %8, i64 4006 + %19 = bitcast i8* %18 to i64* + %20 = load i64, i64* %19, align 8 + %21 = mul i64 %14, %11 + %22 = mul i64 %21, %17 + %23 = mul i64 %22, %20 + %24 = add i64 %23, %7 + %25 = add nuw i64 %6, 1 + %26 = icmp ult i64 %25, %3 + br i1 %26, label %5, label %27 + +27: ; preds = %5, %2 + %28 = phi i64 [ 0, %2 ], [ %24, %5 ] + %29 = add i64 %28, %3 + ret i64 %29 +} + +; test_max_number_reminder: +; unsigned long test_max_number_reminder(char *p, int count) { +; 
unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4003; +; int DISP4 = 4005; +; int DISP5 = 4006; +; int DISP6 = 4007; +; int DISP7 = 4014; +; int DISP8 = 4010; +; int DISP9 = 4011; +; for (; i < count ; i++) { +; unsigned long x1 = *(unsigned long *)(p + i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + i + DISP4); +; unsigned long x5 = *(unsigned long *)(p + i + DISP5); +; unsigned long x6 = *(unsigned long *)(p + i + DISP6); +; unsigned long x7 = *(unsigned long *)(p + i + DISP7); +; unsigned long x8 = *(unsigned long *)(p + i + DISP8); +; unsigned long x9 = *(unsigned long *)(p + i + DISP9); +; res += x1*x2*x3*x4*x5*x6*x7*x8*x9; +; } +; return res + count; +;} + +define i64 @test_max_number_reminder(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_max_number_reminder: +; CHECK: addi 9, 3, 4002 +; CHECK: .LBB2_2: # +; CHECK-NEXT: ldx 12, 9, 6 +; CHECK-NEXT: ld 0, 0(9) +; CHECK-NEXT: mulld 12, 0, 12 +; CHECK-NEXT: addi 11, 9, 1 +; CHECK-NEXT: ldx 30, 9, 7 +; CHECK-NEXT: ld 29, 4(9) +; CHECK-NEXT: ldx 28, 9, 8 +; CHECK-NEXT: ld 27, 12(9) +; CHECK-NEXT: ld 26, 8(9) +; CHECK-NEXT: ldx 25, 9, 10 +; CHECK-NEXT: ldx 9, 9, 5 +; CHECK-NEXT: mulld 9, 12, 9 +; CHECK-NEXT: mulld 9, 9, 30 +; CHECK-NEXT: mulld 9, 9, 29 +; CHECK-NEXT: mulld 9, 9, 28 +; CHECK-NEXT: mulld 9, 9, 27 +; CHECK-NEXT: mulld 9, 9, 26 +; CHECK-NEXT: maddld 3, 9, 25, 3 +; CHECK-NEXT: mr 9, 11 +; CHECK-NEXT: bdnz .LBB2_2 +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %47, label %5 + +5: ; preds = %2, %5 + %6 = phi i64 [ %45, %5 ], [ 0, %2 ] + %7 = phi i64 [ %44, %5 ], [ 0, %2 ] + %8 = getelementptr inbounds i8, i8* %0, i64 %6 + %9 = getelementptr inbounds i8, i8* %8, i64 4001 + %10 = bitcast i8* %9 to i64* + %11 = load i64, i64* %10, align 8 + %12 = getelementptr inbounds i8, i8* %8, i64 4002 + %13 = bitcast i8* %12 to 
i64* + %14 = load i64, i64* %13, align 8 + %15 = getelementptr inbounds i8, i8* %8, i64 4003 + %16 = bitcast i8* %15 to i64* + %17 = load i64, i64* %16, align 8 + %18 = getelementptr inbounds i8, i8* %8, i64 4005 + %19 = bitcast i8* %18 to i64* + %20 = load i64, i64* %19, align 8 + %21 = getelementptr inbounds i8, i8* %8, i64 4006 + %22 = bitcast i8* %21 to i64* + %23 = load i64, i64* %22, align 8 + %24 = getelementptr inbounds i8, i8* %8, i64 4007 + %25 = bitcast i8* %24 to i64* + %26 = load i64, i64* %25, align 8 + %27 = getelementptr inbounds i8, i8* %8, i64 4014 + %28 = bitcast i8* %27 to i64* + %29 = load i64, i64* %28, align 8 + %30 = getelementptr inbounds i8, i8* %8, i64 4010 + %31 = bitcast i8* %30 to i64* + %32 = load i64, i64* %31, align 8 + %33 = getelementptr inbounds i8, i8* %8, i64 4011 + %34 = bitcast i8* %33 to i64* + %35 = load i64, i64* %34, align 8 + %36 = mul i64 %14, %11 + %37 = mul i64 %36, %17 + %38 = mul i64 %37, %20 + %39 = mul i64 %38, %23 + %40 = mul i64 %39, %26 + %41 = mul i64 %40, %29 + %42 = mul i64 %41, %32 + %43 = mul i64 %42, %35 + %44 = add i64 %43, %7 + %45 = add nuw i64 %6, 1 + %46 = icmp ult i64 %45, %3 + br i1 %46, label %5, label %47 + +47: ; preds = %5, %2 + %48 = phi i64 [ 0, %2 ], [ %44, %5 ] + %49 = add i64 %48, %3 + ret i64 %49 +} + +; test_update_ds_prep_recollect: +; unsigned long test_update_ds_prep_recollect(char *p, int count) { +; unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4003; +; int DISP4 = 4006; +; for (; i < count ; i++) { +; unsigned long x1 = *(unsigned long *)(p + 4 * i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + 4 * i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + 4 * i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + 4 * i + DISP4); +; res += x1*x2*x3*x4; +; } +; return res + count; +; } + +define dso_local i64 @test_update_ds_prep_recollect(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_update_ds_prep_recollect: +; CHECK: addi 3, 3, 3998 
+; CHECK: .LBB3_2: # +; CHECK-NEXT: ldu 8, 4(3) +; CHECK-NEXT: ldx 9, 3, 7 +; CHECK-NEXT: mulld 8, 8, 9 +; CHECK-NEXT: ldx 10, 3, 6 +; CHECK-NEXT: mulld 8, 8, 10 +; CHECK-NEXT: ld 11, 4(3) +; CHECK-NEXT: maddld 5, 8, 11, 5 +; CHECK-NEXT: bdnz .LBB3_2 +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %28, label %5 + +5: ; preds = %2, %5 + %6 = phi i64 [ %26, %5 ], [ 0, %2 ] + %7 = phi i64 [ %25, %5 ], [ 0, %2 ] + %8 = shl i64 %6, 2 + %9 = getelementptr inbounds i8, i8* %0, i64 %8 + %10 = getelementptr inbounds i8, i8* %9, i64 4001 + %11 = bitcast i8* %10 to i64* + %12 = load i64, i64* %11, align 8 + %13 = getelementptr inbounds i8, i8* %9, i64 4002 + %14 = bitcast i8* %13 to i64* + %15 = load i64, i64* %14, align 8 + %16 = getelementptr inbounds i8, i8* %9, i64 4003 + %17 = bitcast i8* %16 to i64* + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds i8, i8* %9, i64 4006 + %20 = bitcast i8* %19 to i64* + %21 = load i64, i64* %20, align 8 + %22 = mul i64 %15, %12 + %23 = mul i64 %22, %18 + %24 = mul i64 %23, %21 + %25 = add i64 %24, %7 + %26 = add nuw i64 %6, 1 + %27 = icmp ult i64 %26, %3 + br i1 %27, label %5, label %28 + +28: ; preds = %5, %2 + %29 = phi i64 [ 0, %2 ], [ %25, %5 ] + %30 = add i64 %29, %3 + ret i64 %30 +} + +; test_update_ds_prep_norecollect: +; unsigned long test_update_ds_prep_norecollect(char *p, int count) { +; unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4003; +; int DISP4 = 4007; +; for (; i < count ; i++) { +; char x1 = *(p + i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + i + DISP4); +; res += (unsigned long)x1*x2*x3*x4; +; } +; return res + count; +; } + +define i64 @test_update_ds_prep_norecollect(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_update_ds_prep_norecollect: +; CHECK: addi 5, 3, 4000 +; CHECK: addi 3, 3, 4003 +; CHECK: 
.LBB4_2: # +; CHECK-NEXT: lbzu 8, 1(5) +; CHECK-NEXT: ldx 9, 3, 7 +; CHECK-NEXT: ld 10, 0(3) +; CHECK-NEXT: ld 11, 4(3) +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: mulld 8, 9, 8 +; CHECK-NEXT: mulld 8, 8, 10 +; CHECK-NEXT: maddld 6, 8, 11, 6 +; CHECK-NEXT: bdnz .LBB4_2 +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %27, label %5 + +5: ; preds = %2, %5 + %6 = phi i64 [ %25, %5 ], [ 0, %2 ] + %7 = phi i64 [ %24, %5 ], [ 0, %2 ] + %8 = getelementptr inbounds i8, i8* %0, i64 %6 + %9 = getelementptr inbounds i8, i8* %8, i64 4001 + %10 = load i8, i8* %9, align 1 + %11 = getelementptr inbounds i8, i8* %8, i64 4002 + %12 = bitcast i8* %11 to i64* + %13 = load i64, i64* %12, align 8 + %14 = getelementptr inbounds i8, i8* %8, i64 4003 + %15 = bitcast i8* %14 to i64* + %16 = load i64, i64* %15, align 8 + %17 = getelementptr inbounds i8, i8* %8, i64 4007 + %18 = bitcast i8* %17 to i64* + %19 = load i64, i64* %18, align 8 + %20 = zext i8 %10 to i64 + %21 = mul i64 %13, %20 + %22 = mul i64 %21, %16 + %23 = mul i64 %22, %19 + %24 = add i64 %23, %7 + %25 = add nuw i64 %6, 1 + %26 = icmp ult i64 %25, %3 + br i1 %26, label %5, label %27 + +27: ; preds = %5, %2 + %28 = phi i64 [ 0, %2 ], [ %24, %5 ] + %29 = add i64 %28, %3 + ret i64 %29 +} + +; test_ds_multiple_chains: +; unsigned long test_ds_multiple_chains(char *p, char *q, int count) { +; unsigned long i=0, res=0; +; int DISP1 = 4001; +; int DISP2 = 4010; +; int DISP3 = 4005; +; int DISP4 = 4009; +; for (; i < count ; i++) { +; unsigned long x1 = *(unsigned long *)(p + i + DISP1); +; unsigned long x2 = *(unsigned long *)(p + i + DISP2); +; unsigned long x3 = *(unsigned long *)(p + i + DISP3); +; unsigned long x4 = *(unsigned long *)(p + i + DISP4); +; unsigned long x5 = *(unsigned long *)(q + i + DISP1); +; unsigned long x6 = *(unsigned long *)(q + i + DISP2); +; unsigned long x7 = *(unsigned long *)(q + i + DISP3); +; unsigned long x8 = *(unsigned long *)(q + i + DISP4); +; res += 
x1*x2*x3*x4*x5*x6*x7*x8; +; } +; return res + count; +; } + +define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) { +; CHECK-LABEL: test_ds_multiple_chains: +; CHECK: addi 3, 3, 4001 +; CHECK: addi 4, 4, 4001 +; CHECK: .LBB5_2: # +; CHECK-NEXT: ld 8, 0(3) +; CHECK-NEXT: ldx 9, 3, 7 +; CHECK-NEXT: mulld 8, 9, 8 +; CHECK-NEXT: ld 9, 4(3) +; CHECK-NEXT: mulld 8, 8, 9 +; CHECK-NEXT: ld 10, 8(3) +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: mulld 8, 8, 10 +; CHECK-NEXT: ld 11, 0(4) +; CHECK-NEXT: mulld 8, 8, 11 +; CHECK-NEXT: ldx 12, 4, 7 +; CHECK-NEXT: mulld 8, 8, 12 +; CHECK-NEXT: ld 0, 4(4) +; CHECK-NEXT: mulld 8, 8, 0 +; CHECK-NEXT: ld 30, 8(4) +; CHECK-NEXT: addi 4, 4, 1 +; CHECK-NEXT: maddld 6, 8, 30, 6 +; CHECK-NEXT: bdnz .LBB5_2 +; CHECK: blr + %4 = sext i32 %2 to i64 + %5 = icmp eq i32 %2, 0 + br i1 %5, label %45, label %6 + +6: ; preds = %3, %6 + %7 = phi i64 [ %43, %6 ], [ 0, %3 ] + %8 = phi i64 [ %42, %6 ], [ 0, %3 ] + %9 = getelementptr inbounds i8, i8* %0, i64 %7 + %10 = getelementptr inbounds i8, i8* %9, i64 4001 + %11 = bitcast i8* %10 to i64* + %12 = load i64, i64* %11, align 8 + %13 = getelementptr inbounds i8, i8* %9, i64 4010 + %14 = bitcast i8* %13 to i64* + %15 = load i64, i64* %14, align 8 + %16 = getelementptr inbounds i8, i8* %9, i64 4005 + %17 = bitcast i8* %16 to i64* + %18 = load i64, i64* %17, align 8 + %19 = getelementptr inbounds i8, i8* %9, i64 4009 + %20 = bitcast i8* %19 to i64* + %21 = load i64, i64* %20, align 8 + %22 = getelementptr inbounds i8, i8* %1, i64 %7 + %23 = getelementptr inbounds i8, i8* %22, i64 4001 + %24 = bitcast i8* %23 to i64* + %25 = load i64, i64* %24, align 8 + %26 = getelementptr inbounds i8, i8* %22, i64 4010 + %27 = bitcast i8* %26 to i64* + %28 = load i64, i64* %27, align 8 + %29 = getelementptr inbounds i8, i8* %22, i64 4005 + %30 = bitcast i8* %29 to i64* + %31 = load i64, i64* %30, align 8 + %32 = getelementptr inbounds i8, i8* %22, i64 4009 + %33 = bitcast i8* %32 to i64* + %34 = load 
i64, i64* %33, align 8 + %35 = mul i64 %15, %12 + %36 = mul i64 %35, %18 + %37 = mul i64 %36, %21 + %38 = mul i64 %37, %25 + %39 = mul i64 %38, %28 + %40 = mul i64 %39, %31 + %41 = mul i64 %40, %34 + %42 = add i64 %41, %8 + %43 = add nuw i64 %7, 1 + %44 = icmp ult i64 %43, %4 + br i1 %44, label %6, label %45 + +45: ; preds = %6, %3 + %46 = phi i64 [ 0, %3 ], [ %42, %6 ] + %47 = add i64 %46, %4 + ret i64 %47 +} + +; test_ds_cross_basic_blocks: +;extern char *arr; +;unsigned long foo(char *p, int count) +;{ +; unsigned long i=0, res=0; +; int DISP1 = 4000; +; int DISP2 = 4001; +; int DISP3 = 4002; +; int DISP4 = 4003; +; int DISP5 = 4005; +; int DISP6 = 4009; +; unsigned long x1, x2, x3, x4, x5, x6; +; x1=x2=x3=x4=x5=x6=1; +; for (; i < count ; i++) { +; if (arr[i] % 3 == 1) { +; x1 += *(unsigned long *)(p + i + DISP1); +; x2 += *(unsigned long *)(p + i + DISP2); +; } +; else if (arr[i] % 3 == 2) { +; x3 += *(unsigned long *)(p + i + DISP3); +; x4 += *(unsigned long *)(p + i + DISP5); +; } +; else { +; x5 += *(unsigned long *)(p + i + DISP4); +; x6 += *(unsigned long *)(p + i + DISP6); +; } +; res += x1*x2*x3*x4*x5*x6; +; } +; return res; +;} + +@arr = external local_unnamed_addr global i8*, align 8 + +define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_ds_cross_basic_blocks: +; CHECK: addi 6, 3, 4009 +; CHECK: ldx +; CHECK: ld +; CHECK: ldx +; CHECK: ld +; CHECK: ldx +; CHECK: ld +; CHECK: blr + %3 = sext i32 %1 to i64 + %4 = icmp eq i32 %1, 0 + br i1 %4, label %66, label %5 + +5: ; preds = %2 + %6 = load i8*, i8** @arr, align 8 + br label %7 + +7: ; preds = %5, %51 + %8 = phi i64 [ 1, %5 ], [ %57, %51 ] + %9 = phi i64 [ 1, %5 ], [ %56, %51 ] + %10 = phi i64 [ 1, %5 ], [ %55, %51 ] + %11 = phi i64 [ 1, %5 ], [ %54, %51 ] + %12 = phi i64 [ 1, %5 ], [ %53, %51 ] + %13 = phi i64 [ 1, %5 ], [ %52, %51 ] + %14 = phi i64 [ 0, %5 ], [ %64, %51 ] + %15 = phi i64 [ 0, %5 ], [ %63, %51 ] + %16 = getelementptr inbounds i8, i8* %6, i64 %14 + 
%17 = load i8, i8* %16, align 1 + %18 = urem i8 %17, 3 + %19 = icmp eq i8 %18, 1 + br i1 %19, label %20, label %30 + +20: ; preds = %7 + %21 = getelementptr inbounds i8, i8* %0, i64 %14 + %22 = getelementptr inbounds i8, i8* %21, i64 4000 + %23 = bitcast i8* %22 to i64* + %24 = load i64, i64* %23, align 8 + %25 = add i64 %24, %13 + %26 = getelementptr inbounds i8, i8* %21, i64 4001 + %27 = bitcast i8* %26 to i64* + %28 = load i64, i64* %27, align 8 + %29 = add i64 %28, %12 + br label %51 + +30: ; preds = %7 + %31 = icmp eq i8 %18, 2 + %32 = getelementptr inbounds i8, i8* %0, i64 %14 + br i1 %31, label %33, label %42 + +33: ; preds = %30 + %34 = getelementptr inbounds i8, i8* %32, i64 4002 + %35 = bitcast i8* %34 to i64* + %36 = load i64, i64* %35, align 8 + %37 = add i64 %36, %11 + %38 = getelementptr inbounds i8, i8* %32, i64 4005 + %39 = bitcast i8* %38 to i64* + %40 = load i64, i64* %39, align 8 + %41 = add i64 %40, %10 + br label %51 + +42: ; preds = %30 + %43 = getelementptr inbounds i8, i8* %32, i64 4003 + %44 = bitcast i8* %43 to i64* + %45 = load i64, i64* %44, align 8 + %46 = add i64 %45, %9 + %47 = getelementptr inbounds i8, i8* %32, i64 4009 + %48 = bitcast i8* %47 to i64* + %49 = load i64, i64* %48, align 8 + %50 = add i64 %49, %8 + br label %51 + +51: ; preds = %33, %42, %20 + %52 = phi i64 [ %25, %20 ], [ %13, %33 ], [ %13, %42 ] + %53 = phi i64 [ %29, %20 ], [ %12, %33 ], [ %12, %42 ] + %54 = phi i64 [ %11, %20 ], [ %37, %33 ], [ %11, %42 ] + %55 = phi i64 [ %10, %20 ], [ %41, %33 ], [ %10, %42 ] + %56 = phi i64 [ %9, %20 ], [ %9, %33 ], [ %46, %42 ] + %57 = phi i64 [ %8, %20 ], [ %8, %33 ], [ %50, %42 ] + %58 = mul i64 %53, %52 + %59 = mul i64 %58, %54 + %60 = mul i64 %59, %55 + %61 = mul i64 %60, %56 + %62 = mul i64 %61, %57 + %63 = add i64 %62, %15 + %64 = add nuw i64 %14, 1 + %65 = icmp ult i64 %64, %3 + br i1 %65, label %7, label %66 + +66: ; preds = %51, %2 + %67 = phi i64 [ 0, %2 ], [ %63, %51 ] + ret i64 %67 +} + +; test_ds_float: +;float 
test_ds_float(char *p, int count) { +; int i=0 ; +; float res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4022; +; int DISP4 = 4062; +; for (; i < count ; i++) { +; float x1 = *(float *)(p + i + DISP1); +; float x2 = *(float *)(p + i + DISP2); +; float x3 = *(float *)(p + i + DISP3); +; float x4 = *(float *)(p + i + DISP4); +; res += x1*x2*x3*x4; +; } +; return res; +;} + +define float @test_ds_float(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_ds_float: +; CHECK: addi 3, 3, 4002 +; CHECK: .LBB7_2: # +; CHECK-NEXT: lfsx 0, 3, 4 +; CHECK-NEXT: lfs 2, 0(3) +; CHECK-NEXT: xsmulsp 0, 0, 2 +; CHECK-NEXT: lfs 3, 20(3) +; CHECK-NEXT: xsmulsp 0, 0, 3 +; CHECK-NEXT: lfs 4, 60(3) +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: xsmulsp 0, 0, 4 +; CHECK-NEXT: xsaddsp 1, 1, 0 +; CHECK-NEXT: bdnz .LBB7_2 +; CHECK: blr + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %4, label %28 + +4: ; preds = %2 + %5 = zext i32 %1 to i64 + br label %6 + +6: ; preds = %6, %4 + %7 = phi i64 [ 0, %4 ], [ %26, %6 ] + %8 = phi float [ 0.000000e+00, %4 ], [ %25, %6 ] + %9 = getelementptr inbounds i8, i8* %0, i64 %7 + %10 = getelementptr inbounds i8, i8* %9, i64 4001 + %11 = bitcast i8* %10 to float* + %12 = load float, float* %11, align 4 + %13 = getelementptr inbounds i8, i8* %9, i64 4002 + %14 = bitcast i8* %13 to float* + %15 = load float, float* %14, align 4 + %16 = getelementptr inbounds i8, i8* %9, i64 4022 + %17 = bitcast i8* %16 to float* + %18 = load float, float* %17, align 4 + %19 = getelementptr inbounds i8, i8* %9, i64 4062 + %20 = bitcast i8* %19 to float* + %21 = load float, float* %20, align 4 + %22 = fmul float %12, %15 + %23 = fmul float %22, %18 + %24 = fmul float %23, %21 + %25 = fadd float %8, %24 + %26 = add nuw nsw i64 %7, 1 + %27 = icmp eq i64 %26, %5 + br i1 %27, label %28, label %6 + +28: ; preds = %6, %2 + %29 = phi float [ 0.000000e+00, %2 ], [ %25, %6 ] + ret float %29 +} + +; test_ds_combine_float_int: +;float test_ds_combine_float_int(char *p, int count) { +; 
int i=0 ; +; float res=0; +; int DISP1 = 4001; +; int DISP2 = 4002; +; int DISP3 = 4022; +; int DISP4 = 4062; +; for (; i < count ; i++) { +; float x1 = *(float *)(p + i + DISP1); +; unsigned long x2 = *(unsigned long*)(p + i + DISP2); +; float x3 = *(float *)(p + i + DISP3); +; float x4 = *(float *)(p + i + DISP4); +; res += x1*x2*x3*x4; +; } +; return res; +;} + +define float @test_ds_combine_float_int(i8* %0, i32 signext %1) { +; CHECK-LABEL: test_ds_combine_float_int: +; CHECK: addi 3, 3, 4002 +; CHECK: .LBB8_2: # +; CHECK-NEXT: lfd 4, 0(3) +; CHECK-NEXT: lfsx 0, 3, 4 +; CHECK-NEXT: xscvuxdsp 4, 4 +; CHECK-NEXT: lfs 2, 20(3) +; CHECK-NEXT: xsmulsp 0, 0, 4 +; CHECK-NEXT: xsmulsp 0, 2, 0 +; CHECK-NEXT: lfs 3, 60(3) +; CHECK-NEXT: addi 3, 3, 1 +; CHECK-NEXT: xsmulsp 0, 3, 0 +; CHECK-NEXT: xsaddsp 1, 1, 0 +; CHECK-NEXT: bdnz .LBB8_2 +; CHECK: blr + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %4, label %29 + +4: ; preds = %2 + %5 = zext i32 %1 to i64 + br label %6 + +6: ; preds = %6, %4 + %7 = phi i64 [ 0, %4 ], [ %27, %6 ] + %8 = phi float [ 0.000000e+00, %4 ], [ %26, %6 ] + %9 = getelementptr inbounds i8, i8* %0, i64 %7 + %10 = getelementptr inbounds i8, i8* %9, i64 4001 + %11 = bitcast i8* %10 to float* + %12 = load float, float* %11, align 4 + %13 = getelementptr inbounds i8, i8* %9, i64 4002 + %14 = bitcast i8* %13 to i64* + %15 = load i64, i64* %14, align 8 + %16 = getelementptr inbounds i8, i8* %9, i64 4022 + %17 = bitcast i8* %16 to float* + %18 = load float, float* %17, align 4 + %19 = getelementptr inbounds i8, i8* %9, i64 4062 + %20 = bitcast i8* %19 to float* + %21 = load float, float* %20, align 4 + %22 = uitofp i64 %15 to float + %23 = fmul float %12, %22 + %24 = fmul float %18, %23 + %25 = fmul float %21, %24 + %26 = fadd float %8, %25 + %27 = add nuw nsw i64 %7, 1 + %28 = icmp eq i64 %27, %5 + br i1 %28, label %29, label %6 + +29: ; preds = %6, %2 + %30 = phi float [ 0.000000e+00, %2 ], [ %26, %6 ] + ret float %30 +} Index: 
llvm/test/CodeGen/PowerPC/ppc-passname.ll =================================================================== --- llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -1,13 +1,13 @@ -; Test pass name: ppc-loop-preinc-prep. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: "ppc-loop-preinc-prep" pass is not registered. -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: Prepare loop for pre-inc. addressing modes +; Test pass name: ppc-loop-instr-form-prep. +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-instr-form-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP +; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-instr-form-prep +; STOP-BEFORE-LOOP-PREINC-PREP-NOT: "ppc-loop-instr-form-prep" pass is not registered. +; STOP-BEFORE-LOOP-PREINC-PREP-NOT: Prepare loop for ppc preferred instruction forms -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-LOOP-PREINC-PREP -; STOP-AFTER-LOOP-PREINC-PREP: -ppc-loop-preinc-prep -; STOP-AFTER-LOOP-PREINC-PREP-NOT: "ppc-loop-preinc-prep" pass is not registered. -; STOP-AFTER-LOOP-PREINC-PREP: Prepare loop for pre-inc. addressing modes +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-loop-instr-form-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-LOOP-PREINC-PREP +; STOP-AFTER-LOOP-PREINC-PREP: -ppc-loop-instr-form-prep +; STOP-AFTER-LOOP-PREINC-PREP-NOT: "ppc-loop-instr-form-prep" pass is not registered. +; STOP-AFTER-LOOP-PREINC-PREP: Prepare loop for ppc preferred instruction forms ; Test pass name: ppc-toc-reg-deps. 
Index: llvm/test/CodeGen/PowerPC/swaps-le-1.ll =================================================================== --- llvm/test/CodeGen/PowerPC/swaps-le-1.ll +++ llvm/test/CodeGen/PowerPC/swaps-le-1.ll @@ -164,18 +164,18 @@ ; NOOPTSWAP: stxvd2x ; CHECK-P9-LABEL: @foo -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv +; CHECK-P9-DAG: lxv ; CHECK-P9-DAG: vadduwm ; CHECK-P9-DAG: vadduwm ; CHECK-P9-DAG: vadduwm @@ -184,8 +184,8 @@ ; CHECK-P9-DAG: vmuluwm ; CHECK-P9-DAG: vmuluwm ; CHECK-P9-DAG: vmuluwm -; CHECK-P9-DAG: stxvx -; CHECK-P9-DAG: stxvx -; CHECK-P9-DAG: stxvx -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv +; CHECK-P9-DAG: stxv +; CHECK-P9-DAG: stxv +; CHECK-P9-DAG: stxv