Index: llvm/lib/Transforms/Scalar/LoopInterchange.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -332,9 +332,13 @@
   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
                            CharMatrix &DepMatrix);
 
+  /// Discover induction PHIs in the header of \p L and append them to
+  /// \p Inductions. Returns true if \p Inductions is non-empty afterwards.
+  bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
+
   /// Check if the loop structure is understood. We do not handle triangular
   /// loops for now.
-  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+  bool isLoopStructureUnderstood();
 
   bool currentLimitations();
 
@@ -342,6 +346,10 @@
     return OuterInnerReductions;
   }
 
+  const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
+    return InnerLoopInductions; // Read-only reference; no copy is made.
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
   bool containsUnsafeInstructions(BasicBlock *BB);
@@ -365,6 +373,9 @@
   /// Set of reduction PHIs taking part of a reduction across the inner and
   /// outer loop.
   SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+
+  /// Induction PHIs found in the inner loop header.
+  SmallVector<PHINode *, 8> InnerLoopInductions;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -635,25 +646,26 @@
   return true;
 }
 
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
-    PHINode *InnerInduction) {
-  unsigned Num = InnerInduction->getNumOperands();
+bool LoopInterchangeLegality::isLoopStructureUnderstood() {
   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
-  for (unsigned i = 0; i < Num; ++i) {
-    Value *Val = InnerInduction->getOperand(i);
-    if (isa<Constant>(Val))
-      continue;
-    Instruction *I = dyn_cast<Instruction>(Val);
-    if (!I)
-      return false;
-    // TODO: Handle triangular loops.
-    // e.g. for(int i=0;i<N;i++)
-    //        for(int j=i;j<N;j++)
-    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
-    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
-            InnerLoopPreheader &&
-        !OuterLoop->isLoopInvariant(I)) {
-      return false;
+  for (PHINode *InnerInduction : InnerLoopInductions) {
+    unsigned Num = InnerInduction->getNumOperands();
+    for (unsigned i = 0; i < Num; ++i) {
+      Value *Val = InnerInduction->getOperand(i);
+      if (isa<Constant>(Val))
+        continue;
+      Instruction *I = dyn_cast<Instruction>(Val);
+      if (!I)
+        return false;
+      // TODO: Handle triangular loops.
+      // e.g. for(int i=0;i<N;i++)
+      //        for(int j=i;j<N;j++)
+      unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+      if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+              InnerLoopPreheader &&
+          !OuterLoop->isLoopInvariant(I)) {
+        return false;
+      }
     }
   }
 
@@ -683,12 +695,12 @@
     // InnerInduction, or a binary operator that involves
     // InnerInduction and a constant.
     std::function<bool(Value *)> IsPathToIndVar;
-    IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
-      if (V == InnerInduction)
+    IsPathToIndVar = [this, &IsPathToIndVar](const Value *V) -> bool {
+      if (llvm::is_contained(InnerLoopInductions, V))
         return true;
       if (isa<Constant>(V))
         return true;
-      Instruction *I = dyn_cast<Instruction>(V);
+      const Instruction *I = dyn_cast<Instruction>(V);
       if (!I)
         return false;
       if (isa<CastInst>(I))
@@ -699,6 +711,13 @@
       return false;
     };
 
+    // In case of multiple inner loop indvars, it is okay if LHS and RHS
+    // are both inner indvar related variables.
+    if (IsPathToIndVar(Op0) && IsPathToIndVar(Op1))
+      return true;
+
+    // Otherwise we check if the cmp instruction compares an inner indvar
+    // related variable (Left) with an outer loop invariant (Right).
     if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
       Left = Op0;
       Right = Op1;
@@ -814,7 +833,6 @@
     return true;
   }
 
-  PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
   if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
     LLVM_DEBUG(
@@ -859,24 +877,8 @@
     return true;
   }
 
-  // TODO: Currently we handle only loops with 1 induction variable.
-  if (Inductions.size() != 1) {
-    LLVM_DEBUG(
-        dbgs() << "We currently only support loops with 1 induction variable."
-               << "Failed to interchange due to current limitation\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Only inner loops with 1 induction variable can be "
-                "interchanged currently.";
-    });
-    return true;
-  }
-  InnerInductionVar = Inductions.pop_back_val();
-
   // TODO: Triangular loops are not handled for now.
-  if (!isLoopStructureUnderstood(InnerInductionVar)) {
+  if (!isLoopStructureUnderstood()) {
     LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
@@ -890,6 +892,16 @@
   return false;
 }
 
+bool LoopInterchangeLegality::findInductions(
+    Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
+  for (PHINode &HeaderPhi : L->getHeader()->phis()) {
+    InductionDescriptor Desc; // Passed to isInductionPHI; not read afterwards.
+    if (InductionDescriptor::isInductionPHI(&HeaderPhi, L, SE, Desc))
+      Inductions.push_back(&HeaderPhi);
+  }
+  return !Inductions.empty(); // Also true if the caller pre-filled it.
+}
+
 // We currently only support LCSSA PHI nodes in the inner loop exit, if their
 // users are either reduction PHIs or PHIs outside the outer loop (which means
 // the we are only interested in the final value after the loop).
@@ -1018,6 +1030,11 @@
         return false;
       }
 
+  if (!findInductions(InnerLoop, InnerLoopInductions)) {
+    LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
+    return false;
+  }
+
   if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
     ORE->emit([&]() {
@@ -1274,25 +1291,25 @@
 
 bool LoopInterchangeTransform::transform() {
   bool Transformed = false;
-  Instruction *InnerIndexVar;
 
   if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
-    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
-    if (!InductionPHI) {
+    auto &InductionPHIs = LIL.getInnerLoopInductions();
+    if (InductionPHIs.empty()) {
       LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
       return false;
     }
 
-    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
-    else
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
-    // Ensure that InductionPHI is the first Phi node.
-    if (&InductionPHI->getParent()->front() != InductionPHI)
-      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+    SmallVector<Instruction *, 8> InnerIndexVarList;
+    for (PHINode *CurInductionPHI : InductionPHIs) {
+      if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
+      else
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
+    }
 
     // Create a new latch block for the inner loop. We split at the
     // current latch's terminator and then move the condition and all
@@ -1304,7 +1321,7 @@
 
     SmallSetVector<Instruction *, 4> WorkList;
     unsigned i = 0;
-    auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+    auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
       for (; i < WorkList.size(); i++) {
         // Duplicate instruction and move it the new latch. Update uses that
         // have been moved.
@@ -1316,7 +1333,8 @@
         for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
           Instruction *UserI = cast<Instruction>(U.getUser());
           if (!InnerLoop->contains(UserI->getParent()) ||
-              UserI->getParent() == NewLatch || UserI == InductionPHI)
+              UserI->getParent() == NewLatch ||
+              llvm::is_contained(InductionPHIs, UserI))
             U.set(NewI);
         }
         // Add operands of moved instruction to the worklist, except if they are
@@ -1325,7 +1343,7 @@
           Instruction *OpI = dyn_cast<Instruction>(Op);
           if (!OpI ||
               this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
-              OpI == InductionPHI)
+              llvm::is_contained(InductionPHIs, OpI))
             continue;
           WorkList.insert(OpI);
         }
@@ -1339,7 +1357,8 @@
     if (CondI)
       WorkList.insert(CondI);
     MoveInstructions();
-    WorkList.insert(cast<Instruction>(InnerIndexVar));
+    for (Instruction *InnerIndexVar : InnerIndexVarList)
+      WorkList.insert(cast<Instruction>(InnerIndexVar));
     MoveInstructions();
 
     // Splits the inner loops phi nodes out into a separate basic block.
@@ -1612,7 +1631,6 @@
   updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
                   InnerLoopLatchSuccessor, DTUpdates);
 
-
   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
   else
@@ -1639,7 +1657,8 @@
   SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
   for (PHINode &PHI : InnerLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
-      InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+      InnerLoopPHIs.push_back(&PHI);
+
   for (PHINode &PHI : OuterLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
       OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
@@ -1652,6 +1671,7 @@
     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
   }
   for (PHINode *PHI : InnerLoopPHIs) {
+    LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
     PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
   }
Index: llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
@@ -0,0 +1,240 @@
+; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+
+@b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4
+@a = common dso_local local_unnamed_addr global i32 0, align 4
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (c = 0; c < 100; c++) {
+;;     for (d = 5, e = 5; d > 0, e > 0; d--, e--)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; There are multiple inner loop indvars and only one
+; of them is used in the loop exit condition at the
+; inner loop latch.
+;
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK:    for.body:
+; CHECK:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK:    br label %for.body3.split
+; CHECK:    for.body3:
+; CHECK:    [[INDVAR0:%.*]] = phi i64 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER:%.*]] ]
+; CHECK:    [[INDVAR1:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY3_SPLIT]] ], [ 5, [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A:%.*]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    for.body3.split1:
+; CHECK:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 [[INDVAR0]], i64 [[INDEX:%.*]]
+; CHECK:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK:    br label %for.inc7
+; CHECK:    for.body3.split:
+; CHECK:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK:    [[TMP0]] = add nsw i64 [[INDVAR0]], -1
+; CHECK:    [[TMP1]] = add nsw i32 [[INDVAR1]], -1
+; CHECK:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK:    br i1 [[TMP2]], label %for.cond.for.end8_crit_edge, label  %for.body3
+; CHECK:    for.inc7:
+; CHECK:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK:    br i1 [[TOBOOL:%.*]], label %for.body3.split, label %for.body
+; CHECK:    for.cond.for.end8_crit_edge:
+; CHECK:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i64 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 5, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i64 %indvar0, -1
+  %indvar1.next = add nsw i32 %indvar1, -1
+  %tobool2 = icmp eq i32 %indvar1.next, 0
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %indvars.outer.next.trunc = trunc i64 %indvars.outer.next to i32
+  %tobool = icmp eq i32 %indvars.outer.next.trunc, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge
+  ret void
+}
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (; c; c++) {
+;;     for (d = 5, e = 6; d + e > 0; d--, e = e - 2)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; All inner loop indvars are used in the inner latch.
+;
+define void @test2() {
+; CHECK-LABEL: @test2(
+; CHECK:    for.body:
+; CHECK:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK:    [[INDEX:%.*]] = add nsw i64 [[INDVARS_OUTER]], 9
+; CHECK:    for.body3:
+; CHECK:    [[INDVAR0:%.*]] = phi i64 [ [[TMP2:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER:%.*]] ]
+; CHECK:    [[INDVAR1:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT]] ], [ 6, [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A:%.*]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    for.body3.split1:
+; CHECK:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 [[INDVAR0]], i64 [[INDEX]]
+; CHECK:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK:    for.body3.split:
+; CHECK:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK:    [[TMP0]] = add nsw i32 [[INDVAR1]], -2
+; CHECK:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK:    [[TMP2]] = add nsw i64 [[INDVAR0]], -1
+; CHECK:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], [[TMP1]]
+; CHECK:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK:    br i1 [[TMP4]], label %for.cond.for.end8_crit_edge, label  %for.body3
+; CHECK:    for.inc7:
+; CHECK:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK:    br i1 [[TOBOOL:%.*]], label %for.body3.split, label %for.body
+; CHECK:    for.cond.for.end8_crit_edge:
+; CHECK:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+;
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i64 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 6, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i64 %indvar0, -1
+  %indvar1.next = add nsw i32 %indvar1, -2
+  %indvar1.next.ext = sext i32 %indvar1.next to i64
+  %indvars.add = add nsw i64 %indvar0.next, %indvar1.next.ext
+  %tobool2 = icmp eq i64 %indvars.add, 0
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %indvars.outer.next.trunc = trunc i64 %indvars.outer.next to i32
+  %tobool = icmp eq i32 %indvars.outer.next.trunc, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge
+  ret void
+}
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (; c; c++) {
+;;     d = 5;
+;;     e = 49;
+;;     for (; d != e; d++, e--)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; Two inner loop indvars are involved in the inner loop exit
+; condition as LHS and RHS.
+define void @test3() {
+; CHECK-LABEL: @test3(
+; CHECK:    for.body:
+; CHECK:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK:    [[INDEX:%.*]] = add nsw i64 [[INDVARS_OUTER]], 9
+; CHECK:    br label %for.body3.split
+; CHECK:    for.body3:
+; CHECK:    [[INDVAR0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER:%.*]] ]
+; CHECK:    [[INDVAR1:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT]] ], [ 49, [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A:%.*]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK:    for.body3.split1:
+; CHECK:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i32 [[INDVAR0]], i64 [[INDEX]]
+; CHECK:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK:    br label %for.inc7
+; CHECK:    for.body3.split:
+; CHECK:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK:    [[TMP0]] = add nsw i32 [[INDVAR1]], -1
+; CHECK:    [[TMP1]] = add nsw i32 [[INDVAR0]], 1
+; CHECK:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[TMP0]]
+; CHECK:    br i1 [[TMP2]], label %for.cond.for.end8_crit_edge, label  %for.body3
+; CHECK:    for.inc7:
+; CHECK:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK:    br i1 [[TOBOOL:%.*]], label %for.body3.split, label %for.body
+; CHECK:    for.cond.for.end8_crit_edge:
+; CHECK:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+;
+
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i32 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 49, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i32 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i32 %indvar0, 1
+  %indvar1.next = add nsw i32 %indvar1, -1
+  %tobool2 = icmp eq i32 %indvar0.next, %indvar1.next
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %tobool = icmp eq i64 %indvars.outer.next, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge
+  ret void
+}
\ No newline at end of file