Index: llvm/lib/Transforms/Scalar/LoopFlatten.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -84,6 +84,9 @@
   SmallPtrSet<Value *, 4> LinearIVUses;
   SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
 
+  // Whether this holds the flatten info before or after widening.
+  bool Widened = false;
+
   FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
 };
 
@@ -335,8 +338,9 @@
   // transformation wouldn't be profitable.
 
   Value *InnerLimit = FI.InnerLimit;
-  if (auto *I = dyn_cast<SExtInst>(InnerLimit))
-    InnerLimit = I->getOperand(0);
+  if (FI.Widened &&
+      (isa<SExtInst>(InnerLimit) || isa<ZExtInst>(InnerLimit)))
+    InnerLimit = cast<Instruction>(InnerLimit)->getOperand(0);
 
   // Check that all uses of the inner loop's induction variable match the
   // expected pattern, recording the uses of the outer IV.
@@ -347,7 +351,7 @@
 
   // After widening the IVs, a trunc instruction might have been introduced, so
   // look through truncs.
-  if (dyn_cast<TruncInst>(U) ) {
+  if (isa<TruncInst>(U)) {
     if (!U->hasOneUse())
       return false;
     U = *U->user_begin();
@@ -544,9 +548,9 @@
     BranchInst::Create(InnerExitBlock, InnerExitingBlock);
   DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
 
-  auto HasSExtUser = [] (Value *V) -> Value * {
+  auto HasSZExtUser = [] (Value *V) -> Value * {
     for (User *U : V->users() )
-      if (dyn_cast<SExtInst>(U))
+      if (isa<SExtInst>(U) || isa<ZExtInst>(U))
         return U;
     return nullptr;
   };
@@ -554,9 +558,22 @@
   // Replace all uses of the polynomial calculated from the two induction
   // variables with the one new one.
   for (Value *V : FI.LinearIVUses) {
-    // If the induction variable has been widened, look through the SExt.
-    if (Value *U = HasSExtUser(V))
-      V = U;
+    if (FI.Widened) {
+      // After widening, we have this pattern:
+      //
+      //   %indvar = phi i64 [ .. ]
+      //   %3 = trunc i64 %indvar to i32
+      //   %add.us = add i32 %3, %mul.us
+      //   %idxprom.us = zext i32 %add.us to i64
+      //
+      // The linear IV user is %add.us which is a i32 value. We look through
+      // the Sign/Zero extend, to find %idxprom.us which is a i64 value, which
+      // will be replaced by OuterInductionPHI, another i64 value after widening.
+      if (Value *U = HasSZExtUser(V))
+        V = U;
+    }
+    LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
+               dbgs() << "with: "; FI.OuterInductionPHI->dump());
     V->replaceAllUsesWith(FI.OuterInductionPHI);
   }
 
@@ -613,6 +630,8 @@
     RecursivelyDeleteDeadPHINode(WideIVs[i].NarrowIV);
   }
   // After widening, rediscover all the loop components.
+  assert(Widened && "Widened IV expected");
+  FI.Widened = true;
   return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
 }
 
Index: llvm/test/Transforms/LoopFlatten/widen-iv.ll
===================================================================
--- llvm/test/Transforms/LoopFlatten/widen-iv.ll
+++ llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -108,4 +108,108 @@
   ret void
 }
 
+define void @zext(i32 %N, i16* nocapture %A, i16 %val) {
+; CHECK-LABEL: @zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20_NOT]], label [[FOR_END9:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_INC7_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i32 [[TMP2]], [[N]]
+; CHECK-NEXT:    br label [[FOR_BODY3_US:%.*]]
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 [[INDVAR1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX_US]], align 2
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add i16 [[TMP4]], [[VAL:%.*]]
+; CHECK-NEXT:    store i16 [[ADD5_US]], i16* [[ARRAYIDX_US]], align 2
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_INC7_CRIT_EDGE_US]]
+; CHECK:       for.cond1.for.inc7_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END9_LOOPEXIT:%.*]]
+; CHECK:       for.end9.loopexit:
+; CHECK-NEXT:    br label [[FOR_END9]]
+; CHECK:       for.end9:
+; CHECK-NEXT:    ret void
+;
+; DONTWIDEN-LABEL: @zext(
+; DONTWIDEN-NEXT:  entry:
+; DONTWIDEN-NEXT:    [[CMP20_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; DONTWIDEN-NEXT:    br i1 [[CMP20_NOT]], label [[FOR_END9:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; DONTWIDEN:       for.cond1.preheader.us.preheader:
+; DONTWIDEN-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; DONTWIDEN:       for.cond1.preheader.us:
+; DONTWIDEN-NEXT:    [[I_021_US:%.*]] = phi i32 [ [[INC8_US:%.*]], [[FOR_COND1_FOR_INC7_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; DONTWIDEN-NEXT:    [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; DONTWIDEN-NEXT:    br label [[FOR_BODY3_US:%.*]]
+; DONTWIDEN:       for.body3.us:
+; DONTWIDEN-NEXT:    [[J_019_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY3_US]] ]
+; DONTWIDEN-NEXT:    [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; DONTWIDEN-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64
+; DONTWIDEN-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 [[IDXPROM_US]]
+; DONTWIDEN-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX_US]], align 2
+; DONTWIDEN-NEXT:    [[ADD5_US:%.*]] = add i16 [[TMP0]], [[VAL:%.*]]
+; DONTWIDEN-NEXT:    store i16 [[ADD5_US]], i16* [[ARRAYIDX_US]], align 2
+; DONTWIDEN-NEXT:    [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; DONTWIDEN-NEXT:    [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[N]]
+; DONTWIDEN-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY3_US]], label [[FOR_COND1_FOR_INC7_CRIT_EDGE_US]]
+; DONTWIDEN:       for.cond1.for.inc7_crit_edge.us:
+; DONTWIDEN-NEXT:    [[INC8_US]] = add i32 [[I_021_US]], 1
+; DONTWIDEN-NEXT:    [[CMP_US:%.*]] = icmp ult i32 [[INC8_US]], [[N]]
+; DONTWIDEN-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END9_LOOPEXIT:%.*]]
+; DONTWIDEN:       for.end9.loopexit:
+; DONTWIDEN-NEXT:    br label [[FOR_END9]]
+; DONTWIDEN:       for.end9:
+; DONTWIDEN-NEXT:    ret void
+;
+entry:
+  %cmp20.not = icmp eq i32 %N, 0
+  br i1 %cmp20.not, label %for.end9, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader:
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc8.us, %for.cond1.for.inc7_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %mul.us = mul i32 %i.021.us, %N
+  br label %for.body3.us
+
+for.body3.us:
+  %j.019.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body3.us ]
+  %add.us = add i32 %j.019.us, %mul.us
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i16, i16* %A, i64 %idxprom.us
+  %0 = load i16, i16* %arrayidx.us, align 2
+  %add5.us = add i16 %0, %val
+  store i16 %add5.us, i16* %arrayidx.us, align 2
+  %inc.us = add nuw i32 %j.019.us, 1
+  %cmp2.us = icmp ult i32 %inc.us, %N
+  br i1 %cmp2.us, label %for.body3.us, label %for.cond1.for.inc7_crit_edge.us
+
+for.cond1.for.inc7_crit_edge.us:
+  %inc8.us = add i32 %i.021.us, 1
+  %cmp.us = icmp ult i32 %inc8.us, %N
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.end9.loopexit
+
+for.end9.loopexit:
+  br label %for.end9
+
+for.end9:
+  ret void
+}
+
+
+
 declare dso_local void @f(i32* %0) local_unnamed_addr #1