Index: include/llvm/IR/IntrinsicsSystemZ.td
===================================================================
--- include/llvm/IR/IntrinsicsSystemZ.td
+++ include/llvm/IR/IntrinsicsSystemZ.td
@@ -374,3 +374,7 @@
                                  [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty],
                                  [IntrNoMem]>;
 }
+
+// Intrinsic to load hw-loop counter register (BRCT)
+def int_s390_hwloop_count : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
+                                      [IntrNoMem]>;
Index: lib/Target/SystemZ/CMakeLists.txt
===================================================================
--- lib/Target/SystemZ/CMakeLists.txt
+++ lib/Target/SystemZ/CMakeLists.txt
@@ -12,6 +12,7 @@
 add_public_tablegen_target(SystemZCommonTableGen)
 
 add_llvm_target(SystemZCodeGen
+  SystemZBRCTLoops.cpp
   SystemZAsmPrinter.cpp
   SystemZCallingConv.cpp
   SystemZConstantPoolValue.cpp
Index: lib/Target/SystemZ/SystemZ.h
===================================================================
--- lib/Target/SystemZ/SystemZ.h
+++ lib/Target/SystemZ/SystemZ.h
@@ -132,6 +132,8 @@
 }
 } // end namespace SystemZ
 
+
+FunctionPass *createSystemZBRCTLoops(SystemZTargetMachine &TM);
 FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
                                    CodeGenOpt::Level OptLevel);
 FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
Index: lib/Target/SystemZ/SystemZBRCTLoops.cpp
===================================================================
--- /dev/null
+++ lib/Target/SystemZ/SystemZBRCTLoops.cpp
@@ -0,0 +1,259 @@
+//===-- SystemZBRCTLoops.cpp - Identify and generate BRCT loops ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the BRCT(G) branch
+// instructions.
+//
+// Criteria for BRCT loops:
+//  - Any general register may be used for the ind. var.
+//  - BRCT instruction subtracts one and then compares to 0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "SystemZ.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "brctloops"
+
+STATISTIC(NumBRCTLoops, "Number of loops converted to BRCT loops");
+
+namespace llvm {
+  void initializeSystemZBRCTLoopsPass(PassRegistry&);
+}
+
+namespace {
+  struct SystemZBRCTLoops : public FunctionPass {
+
+  public:
+    static char ID;
+
+    SystemZBRCTLoops() : FunctionPass(ID), TM(nullptr) {
+      initializeSystemZBRCTLoopsPass(*PassRegistry::getPassRegistry());
+    }
+    SystemZBRCTLoops(SystemZTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializeSystemZBRCTLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+    }
+
+  private:
+    bool convertToBRCTLoop(Loop *L);
+
+  private:
+    SystemZTargetMachine *TM;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+    DominatorTree *DT;
+    const TargetLibraryInfo *LibInfo;
+    bool PreserveLCSSA;
+  };
+
+  char SystemZBRCTLoops::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SystemZBRCTLoops, "systemz-brct-loops", "SystemZ BRCT Loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(SystemZBRCTLoops, "systemz-brct-loops", "SystemZ BRCT Loops",
+                    false, false)
+
+FunctionPass *llvm::createSystemZBRCTLoops(SystemZTargetMachine &TM) {
+  return new SystemZBRCTLoops(TM);
+}
+
+bool SystemZBRCTLoops::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DL = &F.getParent()->getDataLayout();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool MadeChange = false;
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
+       I != E; ++I) {
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      MadeChange |= convertToBRCTLoop(L);
+  }
+
+  return MadeChange;
+}
+
+// SE has already said that this loop has a known invariant trip
+// count, so it is legal to transform it. Check however first if it
+// seems like this should be a clean replacement of the original
+// ind. var.
+static bool IVReplacable(BranchInst *Branch) {
+  // Look for compare instruction with one user.
+  ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
+  if (Cmp == nullptr || !Cmp->hasOneUse())
+    return false;
+
+  // Look for incrementing instruction (e.g. add) with two users
+  // (compare and PHI).
+  Instruction *Increment = dyn_cast<Instruction>(Cmp->getOperand(0));
+  if (Increment == nullptr || !Increment->hasNUses(2))
+    return false;
+
+  // Look for a PHI with only one user.
+  bool SimplePhiUse = false;
+  for (Use &U : Increment->operands()) {
+    if (PHINode *PHI = dyn_cast<PHINode>(U)) {
+      if (!PHI->hasOneUse())
+        return false;
+      SimplePhiUse = true;
+    }
+  }
+  if (!SimplePhiUse)
+    return false;
+
+  return true;
+}
+
+bool SystemZBRCTLoops::convertToBRCTLoop(Loop *L) {
+  bool MadeChange = false;
+
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    MadeChange |= convertToBRCTLoop(*I);
+
+  // Every iteration must branch back to header from a single latch block.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (Latch == nullptr)
+    return MadeChange;
+  
+  // Look for a SCEV that suits BRCT.
+  const SCEV *ExitCount = SE->getExitCount(L, Latch);
+  DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+        Latch->getName() << ": " << *ExitCount << "\n");
+  if (isa<SCEVCouldNotCompute>(ExitCount))
+    return MadeChange;
+  if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(ExitCount)) {
+    if (ConstEC->getValue()->isZero())
+      return MadeChange;
+  } else if (!SE->isLoopInvariant(ExitCount, L))
+    return MadeChange;
+  if (SE->getTypeSizeInBits(ExitCount->getType()) != 32 &&
+      SE->getTypeSizeInBits(ExitCount->getType()) != 64)
+    return MadeChange;
+
+  // Make sure this blocks ends with a conditional branch.
+  BranchInst *CountedExitBranch = nullptr;
+  Instruction *TI = (Latch)->getTerminator();
+  if (!TI)
+    return MadeChange;
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (!BI->isConditional())
+      return MadeChange;
+    CountedExitBranch = BI;
+  } else
+    return MadeChange;
+
+  // If there are other uses of the original ind. var, leave it alone.
+  if (!IVReplacable(CountedExitBranch))
+    return MadeChange;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one.
+  if (!Preheader)
+    Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+  if (!Preheader)
+    return MadeChange;
+  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+  MadeChange = true;
+
+  // Use SCEVExpander to get a Value for the trip count.
+  SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
+  Type *CountType = ExitCount->getType();
+  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
+  Value *ECValue =
+      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
+
+  // Insert the count into the preheader.
+  IRBuilder<> CountBuilder(Preheader->getTerminator());
+  Module *M = Preheader->getParent()->getParent();
+  Type *Tys[2] = {CountType, CountType};
+  Value *HWLIFunc = Intrinsic::getDeclaration(M, Intrinsic::s390_hwloop_count,
+                                              Tys);
+  Value *HWLoopCount = CountBuilder.CreateCall(HWLIFunc, ECValue, "hwloopcount");
+
+  // Insert PHI in header
+  IRBuilder<> PhiBuilder(&*L->getHeader()->begin());
+  PHINode *HWPhi = PhiBuilder.CreatePHI(CountType, 4, "hwloopphi");
+
+  // Insert decrement and compare instructions in latch.
+  IRBuilder<> CondBuilder(CountedExitBranch);
+  Value *PostInc = CondBuilder.CreateSub(HWPhi,
+                                         ConstantInt::get(CountType, 1),
+                                         "hwloopdec");
+  Value *NewCond = CondBuilder.CreateICmpUGT(PostInc,
+                                             ConstantInt::get(CountType, 0),
+                                             "hwloopcmp");
+
+  // Insert PHI operands
+  HWPhi->addIncoming(HWLoopCount, Preheader);
+  HWPhi->addIncoming(PostInc, Latch);
+
+  // Update branch
+  Value *OldCond = CountedExitBranch->getCondition();
+  CountedExitBranch->setCondition(NewCond);
+  if (!L->contains(CountedExitBranch->getSuccessor(0)))
+    // The false branch must exit the loop.
+    CountedExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+  DeleteDeadPHIs(Latch);
+
+  ++NumBRCTLoops;
+  return MadeChange;
+}
Index: lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.td
+++ lib/Target/SystemZ/SystemZInstrInfo.td
@@ -202,6 +202,11 @@
   def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>;
 }
 
+// Patterns for hwloop_count intrinsic, which is placed in pre-header
+// by SystemZBRCTLoops.
+def : Pat<(int_s390_hwloop_count GR32:$src),   (i32 GR32:$src)>;
+def : Pat<(int_s390_hwloop_count GR64:$src),   (i64 GR64:$src)>;
+
 //===----------------------------------------------------------------------===//
 // Select instructions
 //===----------------------------------------------------------------------===//
Index: lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -16,6 +16,10 @@
 
 using namespace llvm;
 
+static cl::
+opt<bool> DisableBRCTLoops("disable-systemz-brctloops", cl::Hidden,
+            cl::desc("Disable extra optimization for BRCT loops for SystemZ."));
+
 extern cl::opt<bool> MISchedPostRA;
 extern "C" void LLVMInitializeSystemZTarget() {
   // Register the target.
@@ -105,6 +109,7 @@
   }
 
   void addIRPasses() override;
+  bool addPreISel() override;
   bool addInstSelector() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
@@ -115,6 +120,13 @@
   TargetPassConfig::addIRPasses();
 }
 
+bool SystemZPassConfig::addPreISel() {
+  if (!DisableBRCTLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZBRCTLoops(getSystemZTargetMachine()));
+
+  return false;
+}
+
 bool SystemZPassConfig::addInstSelector() {
   addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
 
Index: test/CodeGen/SystemZ/loop-brct.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/loop-brct.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Function Attrs: nounwind
+
+define zeroext i32 @f1(i32 zeroext %c, i32 zeroext %iter) {
+; CHECK-LABEL: f1:
+; CHECK: lgr %r13, %r3
+; CHECK: brct %r13, .LBB0_2
+
+entry:
+  %cmp6 = icmp eq i32 %iter, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %S.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %S.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i32 [ %add1, %for.body ], [ 0, %entry ]
+  %S.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)()
+  %add = add i32 %call, %S.07
+  %add1 = add nuw i32 %i.08, 1
+  %exitcond = icmp eq i32 %add1, %iter
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare signext i32 @fun(...)
+
+; Function Attrs: nounwind
+define zeroext i32 @f2(i32 zeroext %c, i64 %iter) {
+; CHECK-LABEL: f2:
+; CHECK: ltgr %r13, %r3
+; CHECK: brctg %r13, .LBB1_2
+
+entry:
+  %cmp6 = icmp eq i64 %iter, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %S.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %S.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %add1, %for.body ], [ 0, %entry ]
+  %S.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)()
+  %add = add i32 %call, %S.07
+  %add1 = add nuw i64 %i.08, 1
+  %exitcond = icmp eq i64 %add1, %iter
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+define zeroext i32 @f3(i32 zeroext %c) {
+; CHECK-LABEL: f3:
+; CHECK: lhi %r12, 16
+; CHECK: brct %r12, .LBB2_1
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+  %S.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)()
+  %add = add i32 %call, %S.06
+  %add1 = add nuw nsw i32 %i.07, 16
+  %cmp = icmp ult i32 %add1, 256
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Function Attrs: nounwind
+define zeroext i32 @f4(i32 zeroext %c) {
+; CHECK-LABEL: f4:
+; CHECK: lgfi %r12, 500000000
+; CHECK: brctg %r12, .LBB3_1
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i64 [ 0, %entry ], [ %add1, %for.body ]
+  %S.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %call = tail call signext i32 bitcast (i32 (...)* @fun to i32 ()*)()
+  %add = add i32 %call, %S.06
+  %add1 = add nuw nsw i64 %i.07, 10
+  %cmp = icmp ult i64 %add1, 5000000000
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}