Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -135,6 +135,10 @@
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+  // Add GVN (under -O3) or EarlyCSE (otherwise) to the codegen pass pipeline.
+  void addEarlyCSEOrGVNPass();
 };
 } // end anonymous namespace
 
@@ -148,6 +152,15 @@
       [this](Function &) { return TargetTransformInfo(NVPTXTTIImpl(this)); });
 }
 
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+  // GVN generates significantly better code than EarlyCSE for some CUDA
+  // benchmarks, so we enable it in the aggressive mode.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
 void NVPTXPassConfig::addIRPasses() {
   // The following passes are known to not play well with virtual regs hanging
   // around after register allocation (which in our case, is *all* registers).
@@ -173,14 +186,13 @@
   // the example in reassociate-geps-and-slsr.ll.
   addPass(createStraightLineStrengthReducePass());
   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
-  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
-  // for some of our benchmarks.
-  if (getOptLevel() == CodeGenOpt::Aggressive)
-    addPass(createGVNPass());
-  else
-    addPass(createEarlyCSEPass());
+  // EarlyCSE can reuse.
+  addEarlyCSEOrGVNPass();
   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
   addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates common expressions, so run EarlyCSE/GVN
+  // after it.
+  addEarlyCSEOrGVNPass();
 }
 
 bool NVPTXPassConfig::addInstSelector() {
Index: lib/Transforms/Scalar/NaryReassociate.cpp
===================================================================
--- lib/Transforms/Scalar/NaryReassociate.cpp
+++ lib/Transforms/Scalar/NaryReassociate.cpp
@@ -234,6 +234,7 @@
     BasicBlock *BB = Node->getBlock();
     for (auto I = BB->begin(); I != BB->end(); ++I) {
       if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) {
+        const SCEV *OldSCEV = SE->getSCEV(I);
         if (Instruction *NewI = tryReassociate(I)) {
           Changed = true;
           SE->forgetValue(I);
@@ -243,7 +244,28 @@
         }
         // Add the rewritten instruction to SeenExprs; the original instruction
         // is deleted.
-        SeenExprs[SE->getSCEV(I)].push_back(I);
+        const SCEV *NewSCEV = SE->getSCEV(I);
+        SeenExprs[NewSCEV].push_back(I);
+        // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+        // is equivalent to I. However, ScalarEvolution::getSCEV may
+        // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+        // we reassociate
+        //   I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+        // to
+        //   NewI = &a[sext(i)] + sext(j).
+        //
+        // ScalarEvolution computes
+        //   getSCEV(I)    = a + 4 * sext(i + j)
+        //   getSCEV(NewI) = a + 4 * sext(i) + 4 * sext(j)
+        // which are different SCEVs.
+        //
+        // To alleviate this issue of ScalarEvolution not always capturing
+        // equivalence, we add I to SeenExprs[OldSCEV] as well, so that we can
+        // map both SCEVs before and after tryReassociate(I) to I.
+        //
+        // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+        if (NewSCEV != OldSCEV)
+          SeenExprs[OldSCEV].push_back(I);
       }
     }
   }
Index: test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
===================================================================
--- test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
+++ test/Transforms/NaryReassociate/NVPTX/nary-gep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -nary-reassociate -S | FileCheck %s
+; RUN: opt < %s -nary-reassociate -gvn -S | FileCheck %s
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
@@ -27,24 +27,37 @@
 
 ; foo(&a[sext(j)]);
 ; foo(&a[sext(i +nsw j)]);
+; foo(&a[sext((i +nsw j) +nsw i)]);
 ; =>
-; t = &a[sext(j)];
-; foo(t);
-; foo(t + sext(i));
+; t1 = &a[sext(j)];
+; foo(t1);
+; t2 = t1 + sext(i);
+; foo(t2);
+; t3 = t2 + sext(i); // sext(i) should be GVN'ed.
+; foo(t3);
 define void @reassociate_gep_nsw(float* %a, i32 %i, i32 %j) {
 ; CHECK-LABEL: @reassociate_gep_nsw(
-  %1 = add nsw i32 %i, %j
-  %idxprom.1 = sext i32 %1 to i64
   %idxprom.j = sext i32 %j to i64
-  %2 = getelementptr float, float* %a, i64 %idxprom.j
+  %1 = getelementptr float, float* %a, i64 %idxprom.j
 ; CHECK: [[t1:[^ ]+]] = getelementptr float, float* %a, i64 %idxprom.j
-  call void @foo(float* %2)
+  call void @foo(float* %1)
 ; CHECK: call void @foo(float* [[t1]])
-  %3 = getelementptr float, float* %a, i64 %idxprom.1
+
+  %2 = add nsw i32 %i, %j
+  %idxprom.2 = sext i32 %2 to i64
+  %3 = getelementptr float, float* %a, i64 %idxprom.2
 ; CHECK: [[sexti:[^ ]+]] = sext i32 %i to i64
 ; CHECK: [[t2:[^ ]+]] = getelementptr float, float* [[t1]], i64 [[sexti]]
   call void @foo(float* %3)
 ; CHECK: call void @foo(float* [[t2]])
+
+  %4 = add nsw i32 %2, %i
+  %idxprom.4 = sext i32 %4 to i64
+  %5 = getelementptr float, float* %a, i64 %idxprom.4
+; CHECK: [[t3:[^ ]+]] = getelementptr float, float* [[t2]], i64 [[sexti]]
+  call void @foo(float* %5)
+; CHECK: call void @foo(float* [[t3]])
+
   ret void
 }
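For readers following the NaryReassociate.cpp change, here is the OldSCEV/NewSCEV example from the new comment restated as standalone IR. This is an illustrative sketch and not part of the patch: the function names @index_before and @index_after, the value names, and the %t1 argument (standing in for the earlier &a[sext(j)] GEP) are all made up; only the shapes of the expressions come from the patch.

; Minimal sketch (hypothetical names) of the two GEP shapes the comment in
; NaryReassociate.cpp describes.
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"

; Before reassociation the index is sext(i +nsw j); per the patch comment,
; ScalarEvolution models %p roughly as a + 4 * sext(i + j).
define float* @index_before(float* %a, i32 %i, i32 %j) {
  %sum = add nsw i32 %i, %j
  %idx = sext i32 %sum to i64
  %p = getelementptr float, float* %a, i64 %idx
  ret float* %p
}

; After reassociation the new GEP adds sext(i) on top of %t1, which stands in
; for the earlier &a[sext(j)]; ScalarEvolution models this as
; a + 4 * sext(j) + 4 * sext(i). Because getSCEV may weaken the nsw flag, the
; two SCEVs are structurally different even though the pointers are equal,
; which is why the pass now also records the instruction under OldSCEV.
define float* @index_after(float* %t1, i32 %i) {
  %idx.i = sext i32 %i to i64
  %p = getelementptr float, float* %t1, i64 %idx.i
  ret float* %p
}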