Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -135,6 +135,10 @@
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+  // Add GVN (under -O3) or EarlyCSE (otherwise) to the codegen pass pipeline.
+  void addEarlyCSEOrGVNPass();
 };
 } // end anonymous namespace
 
@@ -148,6 +152,15 @@
       [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); });
 }
 
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+  // GVN generates significantly better code than EarlyCSE for some CUDA
+  // benchmarks, so we enable it in the aggressive mode.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
 void NVPTXPassConfig::addIRPasses() {
   // The following passes are known to not play well with virtual regs hanging
   // around after register allocation (which in our case, is *all* registers).
@@ -173,14 +186,13 @@
   // the example in reassociate-geps-and-slsr.ll.
   addPass(createStraightLineStrengthReducePass());
   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
-  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
-  // for some of our benchmarks.
-  if (getOptLevel() == CodeGenOpt::Aggressive)
-    addPass(createGVNPass());
-  else
-    addPass(createEarlyCSEPass());
+  // EarlyCSE can reuse.
+  addEarlyCSEOrGVNPass();
   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
   addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates common expressions, so run EarlyCSE/GVN
+  // after it.
+  addEarlyCSEOrGVNPass();
 }
 
 bool NVPTXPassConfig::addInstSelector() {
Index: lib/Transforms/Scalar/NaryReassociate.cpp
===================================================================
--- lib/Transforms/Scalar/NaryReassociate.cpp
+++ lib/Transforms/Scalar/NaryReassociate.cpp
@@ -234,6 +234,7 @@
     BasicBlock *BB = Node->getBlock();
     for (auto I = BB->begin(); I != BB->end(); ++I) {
       if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) {
+        const SCEV *OldSCEV = SE->getSCEV(I);
         if (Instruction *NewI = tryReassociate(I)) {
           Changed = true;
           SE->forgetValue(I);
@@ -243,7 +244,28 @@
         }
         // Add the rewritten instruction to SeenExprs; the original instruction
         // is deleted.
-        SeenExprs[SE->getSCEV(I)].push_back(I);
+        const SCEV *NewSCEV = SE->getSCEV(I);
+        SeenExprs[NewSCEV].push_back(I);
+        // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+        // is equivalent to I. However, ScalarEvolution::getSCEV may
+        // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+        // we reassociate
+        //   I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+        // to
+        //   NewI = &a[sext(i)] + sext(j).
+        //
+        // ScalarEvolution computes
+        //   getSCEV(I)    = a + 4 * sext(i + j)
+        //   getSCEV(NewI) = a + 4 * sext(i) + 4 * sext(j)
+        // which are different SCEVs.
+        //
+        // To alleviate this issue of ScalarEvolution not always capturing
+        // equivalence, we add I to SeenExprs[OldSCEV] as well, so that we can
+        // map both SCEVs before and after tryReassociate(I) to I.
+        //
+        // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+        if (NewSCEV != OldSCEV)
+          SeenExprs[OldSCEV].push_back(I);
       }
     }
   }
Index: test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
===================================================================
--- test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
+++ test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -nary-reassociate -S | FileCheck %s
+; RUN: opt < %s -nary-reassociate -gvn -S | FileCheck %s
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
@@ -27,24 +27,37 @@
 
 ; foo(&a[sext(j)]);
 ; foo(&a[sext(i +nsw j)]);
+; foo(&a[sext((i +nsw j) +nsw i)]);
 ; =>
-; t = &a[sext(j)];
-; foo(t);
-; foo(t + sext(i));
+; t1 = &a[sext(j)];
+; foo(t1);
+; t2 = t1 + sext(i);
+; foo(t2);
+; t3 = t2 + sext(i); // sext(i) should be GVN'ed.
+; foo(t3);
 define void @reassociate_gep_nsw(float* %a, i32 %i, i32 %j) {
 ; CHECK-LABEL: @reassociate_gep_nsw(
-  %1 = add nsw i32 %i, %j
-  %idxprom.1 = sext i32 %1 to i64
   %idxprom.j = sext i32 %j to i64
-  %2 = getelementptr float, float* %a, i64 %idxprom.j
+  %1 = getelementptr float, float* %a, i64 %idxprom.j
 ; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i64 %idxprom.j
-  call void @foo(float* %2)
+  call void @foo(float* %1)
 ; CHECK: call void @foo(float* [[t1]])
-  %3 = getelementptr float, float* %a, i64 %idxprom.1
+
+  %2 = add nsw i32 %i, %j
+  %idxprom.2 = sext i32 %2 to i64
+  %3 = getelementptr float, float* %a, i64 %idxprom.2
 ; CHECK: [[sexti:[^ ]+]] = sext i32 %i to i64
 ; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 [[sexti]]
   call void @foo(float* %3)
 ; CHECK: call void @foo(float* [[t2]])
+
+  %4 = add nsw i32 %2, %i
+  %idxprom.4 = sext i32 %4 to i64
+  %5 = getelementptr float, float* %a, i64 %idxprom.4
+; CHECK: [[t3:[^ ]+]] = getelementptr float, float* [[t2]], i64 [[sexti]]
+  call void @foo(float* %5)
+; CHECK: call void @foo(float* [[t3]])
+
   ret void
 }
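For readers following the NaryReassociate.cpp change, here is the OldSCEV/NewSCEV example from the new comment restated as standalone IR. This is an illustrative sketch and not part of the patch: the function names @index_before and @index_after, the value names, and the %t1 argument (standing in for the earlier &a[sext(j)] GEP) are all made up; only the shapes of the expressions come from the patch.

; Minimal sketch (hypothetical names) of the two GEP shapes the comment in
; NaryReassociate.cpp describes.
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"

; Before reassociation the index is sext(i +nsw j); per the patch comment,
; ScalarEvolution models %p roughly as a + 4 * sext(i + j).
define float* @index_before(float* %a, i32 %i, i32 %j) {
  %sum = add nsw i32 %i, %j
  %idx = sext i32 %sum to i64
  %p = getelementptr float, float* %a, i64 %idx
  ret float* %p
}

; After reassociation the new GEP adds sext(i) on top of %t1, which stands in
; for the earlier &a[sext(j)]; ScalarEvolution models this as
; a + 4 * sext(j) + 4 * sext(i). Because getSCEV may weaken the nsw flag, the
; two SCEVs are structurally different even though the pointers are equal,
; which is why the pass now also records the instruction under OldSCEV.
define float* @index_after(float* %t1, i32 %i) {
  %idx.i = sext i32 %i to i64
  %p = getelementptr float, float* %t1, i64 %idx.i
  ret float* %p
}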