Index: llvm/trunk/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ llvm/trunk/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -335,18 +335,6 @@
     return true;
   }
 
-// Helper function to rewrite srem and sdiv. As a policy choice, we choose not
-// to waste compile time on anything where the operands are local defs. While
-// LVI can sometimes reason about such cases, it's not its primary purpose.
-static bool hasLocalDefs(BinaryOperator *SDI) {
-  for (Value *O : SDI->operands()) {
-    auto *I = dyn_cast<Instruction>(O);
-    if (I && I->getParent() == SDI->getParent())
-      return true;
-  }
-  return false;
-}
-
 static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
   Constant *Zero = ConstantInt::get(SDI->getType(), 0);
   for (Value *O : SDI->operands()) {
@@ -358,7 +346,7 @@
 }
 
 static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+  if (SDI->getType()->isVectorTy() ||
       !hasPositiveOperands(SDI, LVI))
     return false;
 
@@ -376,7 +364,7 @@
 /// conditions, this can sometimes prove conditions instcombine can't by
 /// exploiting range information.
 static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
+  if (SDI->getType()->isVectorTy() ||
       !hasPositiveOperands(SDI, LVI))
     return false;
 
@@ -391,7 +379,7 @@
 }
 
 static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
+  if (SDI->getType()->isVectorTy())
     return false;
 
   Constant *Zero = ConstantInt::get(SDI->getType(), 0);
@@ -415,7 +403,7 @@
   if (DontProcessAdds)
     return false;
 
-  if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
+  if (AddOp->getType()->isVectorTy())
     return false;
 
   bool NSW = AddOp->hasNoSignedWrap();
Index: llvm/trunk/test/Transforms/CorrelatedValuePropagation/add.ll
===================================================================
--- llvm/trunk/test/Transforms/CorrelatedValuePropagation/add.ll
+++ llvm/trunk/test/Transforms/CorrelatedValuePropagation/add.ll
@@ -307,3 +307,26 @@
   ret void
 }
 
+; single basic block loop
+; because the loop exit condition is SLT, we can supplement the iv add
+; (iv.next def) with an nsw.
+; CHECK-LABEL: @test16(
+define i32 @test16(i32* %n, i32* %a) {
+preheader:
+  br label %loop
+
+loop:
+; CHECK: %iv.next = add nsw i32 %iv, 1
+  %iv = phi i32 [ 0, %preheader ], [ %iv.next, %loop ]
+  %acc = phi i32 [ 0, %preheader ], [ %acc.curr, %loop ]
+  %x = load atomic i32, i32* %a unordered, align 8
+  fence acquire
+  %acc.curr = add i32 %acc, %x
+  %iv.next = add i32 %iv, 1
+  %nval = load atomic i32, i32* %n unordered, align 8
+  %cmp = icmp slt i32 %iv.next, %nval
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %acc.curr
+}
Index: llvm/trunk/test/Transforms/CorrelatedValuePropagation/ashr.ll
===================================================================
--- llvm/trunk/test/Transforms/CorrelatedValuePropagation/ashr.ll
+++ llvm/trunk/test/Transforms/CorrelatedValuePropagation/ashr.ll
@@ -54,3 +54,46 @@
 exit:
   ret void
 }
+
+; looping case where loop has exactly one block
+; at the point of ashr, we know that the operand is always greater than 0,
+; because of the guard before it, so we can transform it to lshr.
+declare void @llvm.experimental.guard(i1,...)
+; CHECK-LABEL: @test4
+define void @test4(i32 %n) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+; CHECK: lshr i32 %a, 1
+  %a = phi i32 [ %n, %entry ], [ %shr, %loop ]
+  %cond = icmp sgt i32 %a, 2
+  call void(i1,...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %shr = ashr i32 %a, 1
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; same test as above with assume instead of guard.
+declare void @llvm.assume(i1)
+; CHECK-LABEL: @test5
+define void @test5(i32 %n) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+; CHECK: lshr i32 %a, 1
+  %a = phi i32 [ %n, %entry ], [ %shr, %loop ]
+  %cond = icmp sgt i32 %a, 4
+  call void @llvm.assume(i1 %cond)
+  %shr = ashr i32 %a, 1
+  %loopcond = icmp sgt i32 %shr, 8
+  br i1 %loopcond, label %loop, label %exit
+
+exit:
+  ret void
+}
Index: llvm/trunk/test/Transforms/CorrelatedValuePropagation/sdiv.ll
===================================================================
--- llvm/trunk/test/Transforms/CorrelatedValuePropagation/sdiv.ll
+++ llvm/trunk/test/Transforms/CorrelatedValuePropagation/sdiv.ll
@@ -52,3 +52,46 @@
 exit:
   ret void
 }
+
+; looping case where loop has exactly one block
+; at the point of sdiv, we know that %a is always greater than 0,
+; because of the guard before it, so we can transform it to udiv.
+declare void @llvm.experimental.guard(i1,...)
+; CHECK-LABEL: @test4
+define void @test4(i32 %n) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+; CHECK: udiv i32 %a, 6
+  %a = phi i32 [ %n, %entry ], [ %div, %loop ]
+  %cond = icmp sgt i32 %a, 4
+  call void(i1,...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %div = sdiv i32 %a, 6
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; same test as above with assume instead of guard.
+declare void @llvm.assume(i1)
+; CHECK-LABEL: @test5
+define void @test5(i32 %n) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+; CHECK: udiv i32 %a, 6
+  %a = phi i32 [ %n, %entry ], [ %div, %loop ]
+  %cond = icmp sgt i32 %a, 4
+  call void @llvm.assume(i1 %cond)
+  %div = sdiv i32 %a, 6
+  %loopcond = icmp sgt i32 %div, 8
+  br i1 %loopcond, label %loop, label %exit
+
+exit:
+  ret void
+}
Index: llvm/trunk/test/Transforms/CorrelatedValuePropagation/srem.ll
===================================================================
--- llvm/trunk/test/Transforms/CorrelatedValuePropagation/srem.ll
+++ llvm/trunk/test/Transforms/CorrelatedValuePropagation/srem.ll
@@ -19,3 +19,26 @@
 if.end:
   ret void
 }
+
+; looping case where loop has exactly one block
+; at the point of srem, we know that %a is always greater than 0,
+; because of the assume before it, so we can transform it to urem.
+declare void @llvm.assume(i1)
+; CHECK-LABEL: @test4
+define void @test4(i32 %n) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+; CHECK: urem i32 %a, 6
+  %a = phi i32 [ %n, %entry ], [ %rem, %loop ]
+  %cond = icmp sgt i32 %a, 4
+  call void @llvm.assume(i1 %cond)
+  %rem = srem i32 %a, 6
+  %loopcond = icmp sgt i32 %rem, 8
+  br i1 %loopcond, label %loop, label %exit
+
+exit:
+  ret void
+}