Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
===================================================================
--- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -430,8 +430,10 @@
   bool reuniteExts(Instruction *I);
 
   /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
-  Instruction *findClosestMatchingDominator(const SCEV *Key,
-                                            Instruction *Dominatee);
+  Instruction *findClosestMatchingDominator(
+      const SCEV *Key, Instruction *Dominatee,
+      DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
+
   /// Verify F is free of dead code.
   void verifyNoDeadCode(Function &F);
 
@@ -455,7 +457,8 @@
   /// multiple GEPs with a single index.
   bool LowerGEP;
 
-  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
+  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
 } // end anonymous namespace
@@ -1140,7 +1143,8 @@
 }
 
 Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
-    const SCEV *Key, Instruction *Dominatee) {
+    const SCEV *Key, Instruction *Dominatee,
+    DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
   auto Pos = DominatingExprs.find(Key);
   if (Pos == DominatingExprs.end())
     return nullptr;
@@ -1168,12 +1172,11 @@
   // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
   // TODO: handle zext
   Value *LHS = nullptr, *RHS = nullptr;
-  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
-      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
     if (LHS->getType() == RHS->getType()) {
       const SCEV *Key =
           SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
-      if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
         Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
         NewSExt->takeName(I);
         I->replaceAllUsesWith(NewSExt);
@@ -1181,23 +1184,41 @@
         return true;
       }
     }
+  } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+    if (LHS->getType() == RHS->getType()) {
+      const SCEV *Key =
+          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+      if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
+        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+        NewSExt->takeName(I);
+        I->replaceAllUsesWith(NewSExt);
+        RecursivelyDeleteTriviallyDeadInstructions(I);
+        return true;
+      }
+    }
   }
 
   // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
-  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
-      match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
     if (programUndefinedIfFullPoison(I)) {
       const SCEV *Key =
           SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
-      DominatingExprs[Key].push_back(I);
+      DominatingAdds[Key].push_back(I);
     }
+  } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+    if (programUndefinedIfFullPoison(I)) {
+      const SCEV *Key =
+          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+      DominatingSubs[Key].push_back(I);
+    }
   }
   return false;
 }
 
 bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
   bool Changed = false;
-  DominatingExprs.clear();
+  DominatingAdds.clear();
+  DominatingSubs.clear();
   for (const auto Node : depth_first(DT)) {
     BasicBlock *BB = Node->getBlock();
     for (auto I = BB->begin(); I != BB->end(); ) {
Index: test/Transforms/SeparateConstOffsetFromGEP/PowerPC/test-nomatch-add-and-sub.ll
===================================================================
--- test/Transforms/SeparateConstOffsetFromGEP/PowerPC/test-nomatch-add-and-sub.ll
+++ test/Transforms/SeparateConstOffsetFromGEP/PowerPC/test-nomatch-add-and-sub.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=powerpc64le-gnu-linux < %s | FileCheck %s
+
+; Though the error in this function only involves a handful of instructions,
+; the bug needs the ~75 other instructions in order to actually occur, due to
+; other llc analysis passes and the intermittent nature of the issue. This
+; test case is considerably reduced from the originally generated code.
+
+; Function Attrs: noinline nounwind
+define void @tri_solve_reduced(i32 signext %size,
+; CHECK-LABEL: tri_solve_reduced:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi 9, 3, -2
+; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 2, -32(1) # 8-byte Folded Spill
+; CHECK-NEXT:    li 8, 0
+; CHECK-NEXT:    srawi 10, 9, 31
+; CHECK-NEXT:    and 10, 10, 9
+; CHECK-NEXT:    subf 9, 10, 9
+; CHECK-NEXT:    addi 10, 3, -1
+; CHECK-NEXT:    clrldi 9, 9, 32
+; CHECK-NEXT:    extsw 11, 10
+; CHECK-NEXT:    addi 12, 9, 1
+; CHECK-NEXT:    sldi 9, 7, 4
+; CHECK-NEXT:    mtctr 12
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_1: # %loopend
+;
+; CHECK:         addi 8, 8, 1
+; CHECK-NEXT:    stfdx 0, 0, 12
+; CHECK-NEXT:    bdz .LBB0_8
+; CHECK-NEXT:  .LBB0_2: # %outer
+; CHECK-NEXT:    # =>This Loop Header: Depth=1
+; CHECK-NEXT:    # Child Loop BB0_7 Depth 2
+; CHECK-NEXT:    mr 0, 10
+; CHECK-NEXT:    subf 10, 7, 10
+; CHECK-NEXT:    mr 30, 11
+; CHECK-NEXT:    slwi 12, 10, 1
+; CHECK-NEXT:    cmpw 30, 3
+; CHECK-NEXT:    extsw 11, 12
+; CHECK-NEXT:    mr 12, 6
+; CHECK-NEXT:    sldi 11, 11, 3
+; CHECK-NEXT:    lfdux 0, 12, 11
+; CHECK-NEXT:    addi 11, 30, -1
+; CHECK-NEXT:    bge 0, .LBB0_1
+; CHECK-NEXT:  # %bb.3: # %outercheck
+;
+; CHECK:         andi. 29, 8, 1
+; CHECK-NEXT:    extsw 0, 0
+; CHECK-NEXT:    bc 12, 1, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %innerhead
+;
+; CHECK:         mullw 29, 11, 5
+; CHECK-NEXT:    add 0, 0, 7
+; CHECK-NEXT:    add 29, 30, 29
+; CHECK-NEXT:    addi 30, 30, 1
+; CHECK-NEXT:    slwi 29, 29, 1
+; CHECK-NEXT:    extsw 29, 29
+; CHECK-NEXT:    sldi 29, 29, 3
+; CHECK-NEXT:    lfdx 1, 4, 29
+; CHECK-NEXT:    xssubdp 0, 0, 1
+; CHECK-NEXT:  .LBB0_5: # %innercheck
+;
+; CHECK:         cmplwi 8, 0
+; CHECK-NEXT:    beq 0, .LBB0_1
+; CHECK-NEXT:  # %bb.6: # %innerbody.preheader
+;
+; CHECK:         sldi 0, 0, 4
+; CHECK-NEXT:    subf 2, 30, 3
+; CHECK-NEXT:    add 0, 4, 0
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB0_7: # %innerbody
+; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lfdx 1, 0, 0
+; CHECK-NEXT:    addi 2, 2, -2
+; CHECK-NEXT:    add 0, 0, 9
+; CHECK-NEXT:    cmplwi 2, 0
+; CHECK-NEXT:    xssubdp 0, 1, 0
+; CHECK-NEXT:    bne 0, .LBB0_7
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_8: # %fin
+; CHECK-NEXT:    ld 2, -32(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; CHECK-NEXT:    blr
+  double* nocapture readonly %matrix, i32 signext %lda,
+  double* nocapture %vector, i32 signext %stridelen) {
+entry:
+  %initdec = add i32 %size, -1
+  %bigdec = sext i32 %initdec to i64
+  ; Extended operand to matchingadd
+  %bigincX = sext i32 %stridelen to i64
+  br label %outer
+
+outer:                                            ; preds = %entry, %loopend
+  %revi = phi i32 [ 0, %entry ], [ %revinc, %loopend ]
+  %bigi = phi i64 [ %bigdec, %entry ], [ %endeci, %loopend ]
+  ; Short operand to matchingsub
+  %indexolotl = phi i32 [ %initdec, %entry ], [ %matchingsub, %loopend ]
+  %colindex = phi i32 [ %initdec, %entry ], [ %nexti, %loopend ]
+
+  ; This subtraction is matched against matchingadd during the sext-merging
+  ; (reuniteExts) phase of the SeparateConstOffsetFromGEP pass, which is not a
+  ; valid transform because an add and a subtract are different operations.
+
+  %matchingsub = sub nsw i32 %indexolotl, %stridelen
+  %nexti = add nsw i32 %colindex, -1
+  %smallXoffset = shl nsw i32 %matchingsub, 1
+  %bigXoffset = sext i32 %smallXoffset to i64
+  %arrayidx = getelementptr inbounds double, double* %vector, i64 %bigXoffset
+  %numvar4 = load double, double* %arrayidx, align 8
+  %checkout = icmp slt i32 %colindex, %size
+  br i1 %checkout, label %outercheck, label %loopend
+
+outercheck:                                       ; preds = %outer
+  ; Extended operand to matchingadd
+  %longxtolotl = sext i32 %indexolotl to i64 ; Inst of interest
+  %rowindex = mul nsw i32 %nexti, %lda
+  %qcheck = and i32 %revi, 1
+  %co2 = icmp eq i32 %qcheck, 0
+  br i1 %co2, label %innerhead, label %innercheck
+
+innerhead:                                        ; preds = %outercheck
+  %findex = add nsw i32 %colindex, %rowindex
+  %doubledex = shl nsw i32 %findex, 1
+  %bigindex = sext i32 %doubledex to i64
+  %finptr = getelementptr inbounds double, double* %matrix, i64 %bigindex
+  %loadedA = load double, double* %finptr, align 8
+  %headreal = fsub double %numvar4, %loadedA
+
+  ; The match to matchingsub: at compile time on powerpc with -O2, the add
+  ; below is turned into "sext i32 %matchingsub to i64" because its
+  ; sign-extended operands match the un-extended operands of the earlier
+  ; matchingsub.
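+  ;
+  ; A minimal sketch of the miscompile (the names below are illustrative only,
+  ; not values from this test): given a dominating
+  ;     %d = sub nsw i32 %a, %b
+  ; the unfixed pass would rewrite a dominated
+  ;     %x = sext i32 %a to i64
+  ;     %y = sext i32 %b to i64
+  ;     %s = add i64 %x, %y
+  ; into "%s = sext i32 %d to i64", silently turning a + b into a - b. With
+  ; this patch, sext(a) + sext(b) only reunites with a dominating nsw add,
+  ; and sext(a) - sext(b) only with a dominating nsw sub.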
+
+  %matchingadd = add nsw i64 %longxtolotl, %bigincX ; Inst of interest
+  %biginext = add nsw i64 %bigi, 1                  ; Possible: j++
+  %incol = add nsw i32 %colindex, 1
+  br label %innercheck
+
+innercheck:                                       ; preds = %innerhead, %outercheck
+  %innerreal = phi double [ %headreal, %innerhead ], [ undef, %outercheck ]
+  %incxticheck = phi i64 [ %matchingadd, %innerhead ], [ %longxtolotl, %outercheck ]
+  %icinext = phi i64 [ %biginext, %innerhead ], [ %bigi, %outercheck ]
+  %ichkcol = phi i32 [ %incol, %innerhead ], [ %colindex, %outercheck ]
+  %icreal = phi double [ %headreal, %innerhead ], [ %numvar4, %outercheck ]
+  %icrevicheck = icmp eq i32 %revi, 0
+  br i1 %icrevicheck, label %loopend, label %innerbody
+
+innerbody:                                        ; preds = %innercheck, %innerbody
+  %innerloopxtolotl = phi i64 [ %bodyinctolotl, %innerbody ], [ %incxticheck, %innercheck ]
+  %bodyi = phi i64 [ %bodyinci, %innerbody ], [ %icinext, %innercheck ]
+  %bodyj = phi i32 [ %dinextj, %innerbody ], [ %ichkcol, %innercheck ]
+  %bodyreal = phi double [ %newreal, %innerbody ], [ %icreal, %innercheck ]
+  %nextj = add nsw i32 %bodyj, 1
+  %doublotl = shl nsw i64 %innerloopxtolotl, 1
+  %bodyptr = getelementptr inbounds double, double* %matrix, i64 %doublotl
+  %bodyAload = load double, double* %bodyptr, align 8
+  %newreal = fsub double %bodyAload, %bodyreal
+  %bodyinctolotl = add i64 %innerloopxtolotl, %bigincX
+  %bodyinci = add nsw i64 %bodyi, 2
+  %dinextj = add nsw i32 %bodyj, 2
+  %smallinci = trunc i64 %bodyinci to i32
+  %bodycmp = icmp eq i32 %smallinci, %size
+  br i1 %bodycmp, label %loopend, label %innerbody
+
+loopend:                                          ; preds = %innercheck, %innerbody, %outer
+  %endreal = phi double [ %numvar4, %outer ], [ %innerreal, %innercheck ], [ %newreal, %innerbody ]
+  store double %endreal, double* %arrayidx, align 8
+  %endcmp = icmp sgt i32 %nexti, 0
+  %endeci = add nsw i64 %bigi, -1
+  %revinc = add i32 %revi, 1
+  br i1 %endcmp, label %outer, label %fin
+
+fin:
+  ret void
+}