Index: lib/Transforms/Scalar/IndVarSimplify.cpp
===================================================================
--- lib/Transforms/Scalar/IndVarSimplify.cpp
+++ lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -896,6 +896,10 @@
   SmallPtrSet<Instruction*,16> Widened;
   SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
 
+  // A map tracking the kind of extend used to widen each narrow IV or IV user.
+  // Key: a narrow IV or IV user.
+  // Value: whether this instruction is widened by sext.
+  DenseMap<Instruction *, bool> ExtendKindMap;
 public:
   WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
@@ -926,9 +930,11 @@
                               const SCEVAddRecExpr *WideAR);
   Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
 
-  const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse);
+  const SCEVAddRecExpr *getWideRecurrence(NarrowIVDefUse DU,
+                                          bool &WidenBySext);
 
-  const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU);
+  const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU,
+                                                     bool &WidenBySext);
 
   const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
                               unsigned OpCode) const;
@@ -937,6 +943,10 @@
 
   bool widenLoopCompare(NarrowIVDefUse DU);
 
+  bool canWidenBySext(NarrowIVDefUse DU);
+
+  bool canWidenByZext(NarrowIVDefUse DU);
+
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
 } // anonymous namespace
@@ -1005,11 +1015,11 @@
   Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
                    ? WideDef
                    : createExtendInst(NarrowUse->getOperand(0), WideType,
-                                      IsSigned, NarrowUse);
+                                      ExtendKindMap[NarrowDef], NarrowUse);
   Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
                    ? WideDef
                    : createExtendInst(NarrowUse->getOperand(1), WideType,
-                                      IsSigned, NarrowUse);
+                                      ExtendKindMap[NarrowDef], NarrowUse);
 
   auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
   auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
@@ -1086,7 +1096,7 @@
     return WideUse == WideAR;
   };
 
-  bool SignExtend = IsSigned;
+  bool SignExtend = ExtendKindMap[NarrowDef];
   if (!GuessNonIVOperand(SignExtend)) {
     SignExtend = !SignExtend;
     if (!GuessNonIVOperand(SignExtend))
@@ -1128,7 +1138,8 @@
 /// operands. Generate the SCEV value for the widened operation without
 /// actually modifying the IR yet. If the expression after extending the
 /// operands is an AddRec for this loop, return it.
-const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU,
+                                                            bool &WidenBySext) {
   // Handle the common case of add<nsw/nuw>
   const unsigned OpCode = DU.NarrowUse->getOpcode();
@@ -1146,10 +1157,10 @@
   const SCEV *ExtendOperExpr = nullptr;
   const OverflowingBinaryOperator *OBO =
     cast<OverflowingBinaryOperator>(DU.NarrowUse);
-  if (IsSigned && OBO->hasNoSignedWrap())
+  if (ExtendKindMap[DU.NarrowDef] && OBO->hasNoSignedWrap())
     ExtendOperExpr = SE->getSignExtendExpr(
       SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else if(!IsSigned && OBO->hasNoUnsignedWrap())
+  else if(!ExtendKindMap[DU.NarrowDef] && OBO->hasNoUnsignedWrap())
     ExtendOperExpr = SE->getZeroExtendExpr(
       SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
   else
@@ -1172,6 +1183,8 @@
   if (!AddRec || AddRec->getLoop() != L)
     return nullptr;
+
+  WidenBySext = ExtendKindMap[DU.NarrowDef];
   return AddRec;
 }
@@ -1179,11 +1192,12 @@
 /// widening it's type? In other words, can the extend be safely hoisted out of
 /// the loop with SCEV reducing the value to a recurrence on the same loop. If
 /// so, return the sign or zero extended recurrence. Otherwise return NULL.
-const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) {
-  if (!SE->isSCEVable(NarrowUse->getType()))
+const SCEVAddRecExpr *WidenIV::getWideRecurrence(NarrowIVDefUse DU,
+                                                 bool &WidenBySext) {
+  if (!SE->isSCEVable(DU.NarrowUse->getType()))
     return nullptr;
 
-  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
   if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
       SE->getTypeSizeInBits(WideType)) {
     // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
@@ -1191,9 +1205,24 @@
     return nullptr;
   }
 
-  const SCEV *WideExpr = IsSigned ?
-    SE->getSignExtendExpr(NarrowExpr, WideType) :
-    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEV *WideExpr;
+  if (DU.NeverNegative) {
+    WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+    if (isa<SCEVAddRecExpr>(WideExpr))
+      WidenBySext = true;
+    else {
+      WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+      WidenBySext = false;
+    }
+  }
+  else if (ExtendKindMap[DU.NarrowDef]) {
+    WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
+    WidenBySext = true;
+  }
+  else {
+    WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
+    WidenBySext = false;
+  }
   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
   if (!AddRec || AddRec->getLoop() != L)
     return nullptr;
@@ -1234,7 +1263,7 @@
   //      (A) == icmp slt i32 sext(%narrow), sext(%val)
   //          == icmp slt i32 zext(%narrow), sext(%val)
 
-  if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
+  if (!(DU.NeverNegative || ExtendKindMap[DU.NarrowDef] == Cmp->isSigned()))
     return false;
 
   Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
@@ -1258,7 +1287,7 @@
 /// Determine whether an individual user of the narrow IV can be widened. If so,
 /// return the wide clone of the user.
 Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
-
+  // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
   if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
     if (LI->getLoopFor(UsePhi->getParent()) != L) {
@@ -1289,7 +1318,8 @@
     }
   }
   // Our raison d'etre! Eliminate sign and zero extension.
-  if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+  if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySext(DU)) ||
+      (isa<ZExtInst>(DU.NarrowUse) && canWidenByZext(DU))) {
     Value *NewDef = DU.WideDef;
     if (DU.NarrowUse->getType() != WideType) {
       unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
@@ -1325,11 +1355,12 @@
     // No further widening is needed. The deceased [sz]ext had done it for us.
     return nullptr;
   }
-
+
+  bool WidenBySext;
   // Does this user itself evaluate to a recurrence after widening?
-  const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse);
+  const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU, WidenBySext);
   if (!WideAddRec)
-    WideAddRec = getExtendedOperandRecurrence(DU);
+    WideAddRec = getExtendedOperandRecurrence(DU, WidenBySext);
 
   if (!WideAddRec) {
     // If use is a loop condition, try to promote the condition instead of
@@ -1370,10 +1401,19 @@
     return nullptr;
   }
 
+  ExtendKindMap[DU.NarrowUse] = WidenBySext;
   // Returning WideUse pushes it on the worklist.
   return WideUse;
 }
 
+bool WidenIV::canWidenBySext(NarrowIVDefUse DU) {
+  return DU.NeverNegative || ExtendKindMap[DU.NarrowDef];
+}
+
+bool WidenIV::canWidenByZext(NarrowIVDefUse DU) {
+  return DU.NeverNegative || !ExtendKindMap[DU.NarrowDef];
+}
+
 /// Add eligible users of NarrowDef to NarrowIVUsers.
 ///
 void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
@@ -1453,6 +1493,7 @@
   Widened.insert(OrigPhi);
   pushNarrowIVUsers(OrigPhi, WidePhi);
 
+  ExtendKindMap[OrigPhi] = IsSigned;
   while (!NarrowIVUsers.empty()) {
     NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
Index: test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
===================================================================
--- test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
+++ test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
@@ -0,0 +1,203 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-p:64:64:64-n8:16:32:64-S128"
+
+; When widening IV and its users, trunc and zext/sext are not needed
+; if the original 32-bit user is known to be non-negative, whether
+; the IV is considered signed or unsigned.
+define void @foo(i32* %A, i32* %B, i32* %C, i32 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, %N
+; CHECK-NEXT: br i1 [[CMP1]], label %for.body.lr.ph, label %for.end
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV:%.*]].next, %for.inc ], [ 0, %for.body.lr.ph ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %B, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* %C, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: br label %for.inc
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 %N to i64
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %for.body, label %for.cond.for.end_crit_edge
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: br label %for.end
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp1 = icmp slt i32 0, %N
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %idxprom = sext i32 %i.02 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.02, 2
+  %idxprom1 = zext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %0, %1
+  %idxprom4 = zext i32 %i.02 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %idxprom4
+  store i32 %add3, i32* %arrayidx5, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %N
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.inc
+  br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+define void @foo1(i32* %A, i32* %B, i32* %C, i32 %N) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, %N
+; CHECK-NEXT: br i1 [[CMP1]], label %for.body.lr.ph, label %for.end
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV:%.*]].next, %for.inc ], [ 0, %for.body.lr.ph ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %B, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* %C, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: br label %for.inc
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 %N to i64
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %for.body, label %for.cond.for.end_crit_edge
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: br label %for.end
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp1 = icmp slt i32 0, %N
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %idxprom = zext i32 %i.02 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.02, 2
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %0, %1
+  %idxprom4 = sext i32 %i.02 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %idxprom4
+  store i32 %add3, i32* %arrayidx5, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %N
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.inc
+  br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+
+@a = common global [100 x i32] zeroinitializer, align 16
+@b = common global [100 x i32] zeroinitializer, align 16
+
+define i32 @foo2(i32 %M) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, %M
+; CHECK-NEXT: br i1 [[CMP1]], label %for.body.lr.ph, label %for.end
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 %M to i64
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV:%.*]].next, %for.inc ], [ 0, %for.body.lr.ph ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 [[TMP3]]
+; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: br label %for.inc
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 %M to i64
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %for.body, label %for.cond.for.end_crit_edge
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: br label %for.end
+; CHECK: for.end:
+; CHECK-NEXT: [[CALL:%.*]] = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0))
+; CHECK-NEXT: ret i32 0
+;
+entry:
+  %cmp1 = icmp slt i32 0, %M
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %idxprom = zext i32 %i.02 to i64
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %i.02 to i64
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* @b, i64 0, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %0, %1
+  %add3 = add nsw i32 %i.02, %M
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 %idxprom4
+  store i32 %add, i32* %arrayidx5, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %M
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.inc
+  br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+  %call = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0))
+  ret i32 0
+}
+
+declare i32 @dummy(i32*, i32*)
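
For illustration only, not part of the patch: a minimal before/after sketch, in the spirit of the @foo and @foo1 tests above, of what tracking the extend kind per IV user buys. The value names %i, %zidx, %sidx and the block labels here are hypothetical. Before -indvars, a non-negative narrow IV feeds both a zext user and a sext user:

  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]   ; narrow IV, known non-negative
  %zidx = zext i32 %i to i64                    ; unsigned user
  %sidx = sext i32 %i to i64                    ; signed user
  %inc = add nsw i32 %i, 1

After widening, both extends can be eliminated in favor of direct uses of a single 64-bit IV, which is what the CHECK lines for @foo and @foo1 expect:

  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
  ; former %zidx and %sidx users take %indvars.iv directly
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1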