Index: include/llvm/IR/IntrinsicsSystemZ.td =================================================================== --- include/llvm/IR/IntrinsicsSystemZ.td +++ include/llvm/IR/IntrinsicsSystemZ.td @@ -374,3 +374,7 @@ [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; } + +// Intrinsic to load hw-loop counter register (BRCT) +def int_s390_hwloop_count : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], + [IntrNoMem]>; Index: lib/Target/SystemZ/CMakeLists.txt =================================================================== --- lib/Target/SystemZ/CMakeLists.txt +++ lib/Target/SystemZ/CMakeLists.txt @@ -12,6 +12,7 @@ add_public_tablegen_target(SystemZCommonTableGen) add_llvm_target(SystemZCodeGen + SystemZBRCTLoops.cpp SystemZAsmPrinter.cpp SystemZCallingConv.cpp SystemZConstantPoolValue.cpp Index: lib/Target/SystemZ/SystemZ.h =================================================================== --- lib/Target/SystemZ/SystemZ.h +++ lib/Target/SystemZ/SystemZ.h @@ -132,6 +132,8 @@ } } // end namespace SystemZ + +FunctionPass *createSystemZBRCTLoops(SystemZTargetMachine &TM); FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel); FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM); Index: lib/Target/SystemZ/SystemZBRCTLoops.cpp =================================================================== --- /dev/null +++ lib/Target/SystemZ/SystemZBRCTLoops.cpp @@ -0,0 +1,259 @@ +//===-- SystemZBRCTLoops.cpp - Identify and generate BRCT loops ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies loops where we can generate the BRCT(G) branch +// instructions. +// +// Criteria for BRCT loops: +// - Any general register may be used for the ind. var. +// - BRCT instruction subtracts one and then compares to 0. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "SystemZ.h" +#include "SystemZTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "brctloops" + +STATISTIC(NumBRCTLoops, "Number of loops converted to BRCT loops"); + +namespace llvm { + void initializeSystemZBRCTLoopsPass(PassRegistry&); +} + +namespace { + struct SystemZBRCTLoops : public FunctionPass { + + public: + static char ID; + + SystemZBRCTLoops() : FunctionPass(ID), TM(nullptr) { + initializeSystemZBRCTLoopsPass(*PassRegistry::getPassRegistry()); + } + SystemZBRCTLoops(SystemZTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + initializeSystemZBRCTLoopsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + } + + private: + bool convertToBRCTLoop(Loop *L); + + private: + SystemZTargetMachine *TM; + LoopInfo *LI; + ScalarEvolution *SE; + const DataLayout *DL; + DominatorTree *DT; + const TargetLibraryInfo *LibInfo; + bool PreserveLCSSA; + }; + + char SystemZBRCTLoops::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(SystemZBRCTLoops, "systemz-brct-loops", "SystemZ BRCT Loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(SystemZBRCTLoops, "systemz-brct-loops", "SystemZ BRCT Loops", + false, false) + +FunctionPass *llvm::createSystemZBRCTLoops(SystemZTargetMachine &TM) { + return new SystemZBRCTLoops(TM); +} + +bool SystemZBRCTLoops::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + SE = &getAnalysis().getSE(); + DT = &getAnalysis().getDomTree(); + DL = &F.getParent()->getDataLayout(); + auto *TLIP = getAnalysisIfAvailable(); + LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + bool MadeChange = false; + + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); + I != E; ++I) { + Loop *L = *I; + if (!L->getParentLoop()) + MadeChange |= convertToBRCTLoop(L); + } + + return MadeChange; +} + +// SE has already said that this loop has a known invariant trip +// count, so it is legal to transform it. Check however first if it +// seems like this should be a clean replacement of the original +// ind. var. +static bool IVReplacable(BranchInst *Branch) { + // Look for compare instruction with one user. + ICmpInst *Cmp = dyn_cast(Branch->getCondition()); + if (Cmp == nullptr || !Cmp->hasOneUse()) + return false; + + // Look for incrementing instruction (e.g. add) with two users + // (compare and PHI). + Instruction *Increment = dyn_cast(Cmp->getOperand(0)); + if (Increment == nullptr || !Increment->hasNUses(2)) + return false; + + // Look for a PHI with only one user. + bool SimplePhiUse = false; + for (Use &U : Increment->operands()) { + if (PHINode *PHI = dyn_cast(U)) { + if (!PHI->hasOneUse()) + return false; + SimplePhiUse = true; + } + } + if (!SimplePhiUse) + return false; + + return true; +} + +bool SystemZBRCTLoops::convertToBRCTLoop(Loop *L) { + bool MadeChange = false; + + // Process nested loops first. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + MadeChange |= convertToBRCTLoop(*I); + + // Every iteration must branch back to header from a single latch block. + BasicBlock *Latch = L->getLoopLatch(); + if (Latch == nullptr) + return MadeChange; + + // Look for a SCEV that suits BRCT. + const SCEV *ExitCount = SE->getExitCount(L, Latch); + DEBUG(dbgs() << "Exit Count for " << *L << " from block " << + Latch->getName() << ": " << *ExitCount << "\n"); + if (isa(ExitCount)) + return MadeChange; + if (const SCEVConstant *ConstEC = dyn_cast(ExitCount)) { + if (ConstEC->getValue()->isZero()) + return MadeChange; + } else if (!SE->isLoopInvariant(ExitCount, L)) + return MadeChange; + if (SE->getTypeSizeInBits(ExitCount->getType()) != 32 && + SE->getTypeSizeInBits(ExitCount->getType()) != 64) + return MadeChange; + + // Make sure this blocks ends with a conditional branch. + BranchInst *CountedExitBranch = nullptr; + Instruction *TI = (Latch)->getTerminator(); + if (!TI) + return MadeChange; + if (BranchInst *BI = dyn_cast(TI)) { + if (!BI->isConditional()) + return MadeChange; + CountedExitBranch = BI; + } else + return MadeChange; + + // If there are other uses of the original ind. var, leave it alone. + if (!IVReplacable(CountedExitBranch)) + return MadeChange; + + BasicBlock *Preheader = L->getLoopPreheader(); + + // If we don't have a preheader, then insert one. + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + if (!Preheader) + return MadeChange; + DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n"); + + MadeChange = true; + + // Use SCEVExpander to get a Value for the trip count. + SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt"); + Type *CountType = ExitCount->getType(); + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + Value *ECValue = + SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); + + // Insert the count into the preheader. + IRBuilder<> CountBuilder(Preheader->getTerminator()); + Module *M = Preheader->getParent()->getParent(); + Type *Tys[2] = {CountType, CountType}; + Value *HWLIFunc = Intrinsic::getDeclaration(M, Intrinsic::s390_hwloop_count, + Tys); + Value *HWLoopCount = CountBuilder.CreateCall(HWLIFunc, ECValue, "hwloopcount"); + + // Insert PHI in header + IRBuilder<> PhiBuilder(&*L->getHeader()->begin()); + PHINode *HWPhi = PhiBuilder.CreatePHI(CountType, 4, "hwloopphi"); + + // Insert decrement and compare instructions in latch. + IRBuilder<> CondBuilder(CountedExitBranch); + Value *PostInc = CondBuilder.CreateSub(HWPhi, + ConstantInt::get(CountType, 1), + "hwloopdec"); + Value *NewCond = CondBuilder.CreateICmpUGT(PostInc, + ConstantInt::get(CountType, 0), + "hwloopcmp"); + + // Insert PHI operands + HWPhi->addIncoming(HWLoopCount, Preheader); + HWPhi->addIncoming(PostInc, Latch); + + // Update branch + Value *OldCond = CountedExitBranch->getCondition(); + CountedExitBranch->setCondition(NewCond); + if (!L->contains(CountedExitBranch->getSuccessor(0))) + // The false branch must exit the loop. + CountedExitBranch->swapSuccessors(); + + // The old condition may be dead now, and may have even created a dead PHI + // (the original induction variable). + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + DeleteDeadPHIs(Latch); + + ++NumBRCTLoops; + return MadeChange; +} Index: lib/Target/SystemZ/SystemZInstrInfo.td =================================================================== --- lib/Target/SystemZ/SystemZInstrInfo.td +++ lib/Target/SystemZ/SystemZInstrInfo.td @@ -202,6 +202,11 @@ def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>; } +// Patterns for hwloop_count intrinsic, which is placed in pre-header +// by SystemZBRCTLoops. +def : Pat<(int_s390_hwloop_count GR32:$src), (i32 GR32:$src)>; +def : Pat<(int_s390_hwloop_count GR64:$src), (i64 GR64:$src)>; + //===----------------------------------------------------------------------===// // Select instructions //===----------------------------------------------------------------------===// Index: lib/Target/SystemZ/SystemZTargetMachine.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetMachine.cpp +++ lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -16,6 +16,10 @@ using namespace llvm; +static cl:: +opt DisableBRCTLoops("disable-systemz-brctloops", cl::Hidden, + cl::desc("Disable extra optimization for BRCT loops for SystemZ.")); + extern cl::opt MISchedPostRA; extern "C" void LLVMInitializeSystemZTarget() { // Register the target. @@ -105,6 +109,7 @@ } void addIRPasses() override; + bool addPreISel() override; bool addInstSelector() override; void addPreSched2() override; void addPreEmitPass() override; @@ -115,6 +120,13 @@ TargetPassConfig::addIRPasses(); } +bool SystemZPassConfig::addPreISel() { + if (!DisableBRCTLoops && getOptLevel() != CodeGenOpt::None) + addPass(createSystemZBRCTLoops(getSystemZTargetMachine())); + + return false; +} + bool SystemZPassConfig::addInstSelector() { addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel())); Index: test/CodeGen/SystemZ/loop-brct.ll =================================================================== --- /dev/null +++ test/CodeGen/SystemZ/loop-brct.ll @@ -0,0 +1,96 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +; Function Attrs: nounwind + +define zeroext i32 @f1(i32 zeroext %c, i32 zeroext %iter) { +; CHECK-LABEL: f1: +; CHECK: lgr %r13, %r3 +; CHECK: brct %r13, .LBB0_2 + +entry: + %cmp6 = icmp eq i32 %iter, 0 + br i1 %cmp6, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %S.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %S.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.08 = phi i32 [ %add1, %for.body ], [ 0, %entry ] + %S.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)() + %add = add i32 %call, %S.07 + %add1 = add nuw i32 %i.08, 1 + %exitcond = icmp eq i32 %add1, %iter + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +declare signext i32 @fun(...) + +; Function Attrs: nounwind +define zeroext i32 @f2(i32 zeroext %c, i64 %iter) { +; CHECK-LABEL: f2: +; CHECK: ltgr %r13, %r3 +; CHECK: brctg %r13, .LBB1_2 + +entry: + %cmp6 = icmp eq i64 %iter, 0 + br i1 %cmp6, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %S.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %S.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ %add1, %for.body ], [ 0, %entry ] + %S.07 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)() + %add = add i32 %call, %S.07 + %add1 = add nuw i64 %i.08, 1 + %exitcond = icmp eq i64 %add1, %iter + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: nounwind +define zeroext i32 @f3(i32 zeroext %c) { +; CHECK-LABEL: f3: +; CHECK: lhi %r12, 16 +; CHECK: brct %r12, .LBB2_1 + +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %S.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)() + %add = add i32 %call, %S.06 + %add1 = add nuw nsw i32 %i.07, 16 + %cmp = icmp ult i32 %add1, 256 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Function Attrs: nounwind +define zeroext i32 @f4(i32 zeroext %c) { +; CHECK-LABEL: f4: +; CHECK: lgfi %r12, 500000000 +; CHECK: brctg %r12, .LBB3_1 + +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add + +for.body: ; preds = %entry, %for.body + %i.07 = phi i64 [ 0, %entry ], [ %add1, %for.body ] + %S.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)() + %add = add i32 %call, %S.06 + %add1 = add nuw nsw i64 %i.07, 10 + %cmp = icmp ult i64 %add1, 5000000000 + br i1 %cmp, label %for.body, label %for.cond.cleanup +}