diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -135,6 +135,9 @@
 AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
                 cl::desc("Allow widening of indvars to eliminate s/zext"));
 
+static cl::opt<bool>
+EnableSimplifyTC("enable-simplify-trip-count", cl::Hidden, cl::init(true),
+                 cl::desc("Transform the trip count into simpler forms."));
 
 namespace {
 class IndVarSimplify {
@@ -171,6 +174,8 @@
 
   bool sinkUnusedInvariants(Loop *L);
 
+  bool simplifyTripCount(Loop *L, ScalarEvolution *SE);
+
 public:
   IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                  const DataLayout &DL, TargetLibraryInfo *TLI,
@@ -2002,6 +2007,148 @@
   return Changed;
 }
 
+// Original loop:
+// \code
+// loop.preheader:
+//   <...>
+//   %add = add nsw i64 %IVInitial, %tripcount
+//   br label %loop
+//
+// loop:
+//   %indvars.iv = phi i64 [ %IVInitial, %loop.preheader ],
+//                         [ %indvars.iv.next, %loop ]
+//   %gep = getelementptr inbounds double, double* %values, i64 %indvars.iv
+//   <...>
+//   %indvars.iv.next = add nsw i64 %indvars.iv, %step
+//   %cmp = icmp slt i64 %indvars.iv.next, %add
+//   <...>
+// \endcode
+//
+// Transform it into:
+// \code
+// loop.preheader:
+//   <...>
+//   %add = add nsw i64 %IVInitial, %tripcount
+//   %tc.new = sub nsw i64 %add, %IVInitial  ; folds back to %tripcount
+//   br label %loop
+//
+// loop:
+//   %indvars.iv.new = phi i64 [ 0, %loop.preheader ],
+//                             [ %indvars.iv.new.next, %loop ]
+//   %iv.offset = add i64 %indvars.iv.new, %IVInitial
+//   %gep = getelementptr inbounds double, double* %values, i64 %iv.offset
+//   <...>
+//   %indvars.iv.new.next = add nsw i64 %indvars.iv.new, %step
+//   %cmp = icmp slt i64 %indvars.iv.new.next, %tc.new
+//   <...>
+// \endcode
+bool IndVarSimplify::simplifyTripCount(Loop *L, ScalarEvolution *SE) {
+  // Only optimize inner loops for now.
+  if (!L->getSubLoops().empty())
+    return false;
+
+  // Only optimize single-block loops, i.e. the header is also the latch.
+  if (L->getLoopLatch() != L->getHeader())
+    return false;
+
+  // Only optimize loops with a recognizable induction variable. This also
+  // guarantees loop-simplify form, so the preheader exists below.
+  PHINode *IV = L->getInductionVariable(*SE);
+  if (!IV)
+    return false;
+
+  // The rewrite needs a unique exit block for the new exit branch.
+  if (!L->getUniqueExitBlock())
+    return false;
+
+  // Detach the trip count from the latch comparison.
+  // case 1:
+  //   iv.next = iv + step
+  //   LatchCmp = iv.next < tripcount
+  //
+  // case 2:
+  //   iv.next = iv + step
+  //   LatchCmp = tripcount > iv.next
+  Value *TC = nullptr, *IVNext = nullptr;
+  auto *LatchCmp = L->getLatchCmpInst();
+  if (!LatchCmp)
+    return false;
+  Value *LatchCmpOp0 = LatchCmp->getOperand(0);
+  Value *LatchCmpOp1 = LatchCmp->getOperand(1);
+  if (L->isLoopInvariant(LatchCmpOp0)) {
+    TC = LatchCmpOp0;
+    IVNext = LatchCmpOp1;
+  } else {
+    TC = LatchCmpOp1;
+    IVNext = LatchCmpOp0;
+  }
+  // Bail out if neither side of the comparison is loop-invariant.
+  if (!L->isLoopInvariant(TC))
+    return false;
+
+  // Strip sext or zext to get the real trip count:
+  //   %TC = sext i32 %TCValue to i64
+  //   %TC = zext i32 %TCValue to i64
+  Value *TCValue = TC;
+  auto *TCInst = dyn_cast<Instruction>(TC);
+  if (TCInst && (isa<SExtInst>(TCInst) || isa<ZExtInst>(TCInst)))
+    TCValue = TCInst->getOperand(0);
+
+  // Strip sext or zext to get the initial value of the induction variable:
+  //   %IVInitial = sext i32 %IVInitialValue to i64
+  //   %IVInitial = zext i32 %IVInitialValue to i64
+  //   %iv = phi i64 [ %IVInitial, %loop.preheader ], [ %iv.next, %latch ]
+  Value *IVInitial = IV->getIncomingValueForBlock(L->getLoopPreheader());
+  Value *IVInitialValue = IVInitial;
+  auto *IVInitialInst = dyn_cast<Instruction>(IVInitial);
+  if (IVInitialInst &&
+      (isa<SExtInst>(IVInitialInst) || isa<ZExtInst>(IVInitialInst)))
+    IVInitialValue = IVInitialInst->getOperand(0);
+
+  // We only handle cases of the form:
+  //   %TCValue = %IVInitialValue + %TCNew (IV + step < TCValue)
+  // or
+  //   %TCValue = %IVInitialValue - %TCNew (IV + (-step) > TCValue)
+  auto *TCValueInst = dyn_cast<Instruction>(TCValue);
+  if (!TCValueInst || !(TCValueInst->getOpcode() == Instruction::Add ||
+                        TCValueInst->getOpcode() == Instruction::Sub))
+    return false;
+
+  if (!(TCValueInst->getOperand(0) == IVInitialValue ||
+        TCValueInst->getOperand(1) == IVInitialValue))
+    return false;
+
+  // Filter out the form: %TCValue = %TCNew - %IVInitialValue
+  if (TCValueInst->getOpcode() == Instruction::Sub &&
+      TCValueInst->getOperand(0) != IVInitialValue)
+    return false;
+
+  auto *IVNextInst = dyn_cast<BinaryOperator>(IVNext);
+  if (!IVNextInst)
+    return false;
+  Value *Step = nullptr;
+  if (IVNextInst->getOperand(1) == IV)
+    Step = IVNextInst->getOperand(0);
+  else
+    Step = IVNextInst->getOperand(1);
+
+  IRBuilder<> Builder(TCValueInst->getParent()->getTerminator());
+  // Rebase the trip count to start from zero.
+  Value *TCNew = Builder.CreateSub(TC, IVInitial);
+
+  // Replace the old induction variable and update the exit condition.
+  Builder.SetInsertPoint(IV);
+  PHINode *NewIV = Builder.CreatePHI(TCNew->getType(), IV->getNumOperands(),
+                                     "indvars.iv.new");
+  auto *OldBr = L->getLoopLatch()->getTerminator();
+  Builder.SetInsertPoint(OldBr);
+  Value *Next = Builder.CreateAdd(NewIV, Step, "indvars.iv.new.next");
+  NewIV->addIncoming(ConstantInt::get(TCNew->getType(), 0),
+                     L->getLoopPreheader());
+  NewIV->addIncoming(Next, L->getLoopLatch());
+  Value *ICmp = Builder.CreateICmpEQ(Next, TCNew);
+  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), L->getLoopLatch());
+  OldBr->eraseFromParent();
+
+  // Offset the new IV so its users observe the same values as before the
+  // transformation.
+  Builder.SetInsertPoint(NewIV->getParent()->getFirstNonPHI());
+  Value *IVOffset = Builder.CreateAdd(NewIV, IVInitial, "iv.offset");
+  IV->replaceAllUsesWith(IVOffset);
+  IV->eraseFromParent();
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // IndVarSimplify driver. Manage several subpasses of IV simplification.
 //===----------------------------------------------------------------------===//
@@ -2044,6 +2191,13 @@
   Rewriter.setDebugType(DEBUG_TYPE);
 #endif
 
+  // Simplify the trip-count expression.
+  if (EnableSimplifyTC && simplifyTripCount(L, SE)) {
+    Changed = true;
+    // The exit count and IV have changed, so invalidate SCEV's loop info.
+    SE->forgetLoop(L);
+  }
+
   // Eliminate redundant IV users.
   //
   // Simplification works best when run before other consumers of SCEV.
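To make the intent of simplifyTripCount easier to review, here is a minimal source-level sketch of the rewrite (the names init, n, values, and sum are illustrative only and do not appear in the patch):

  // Before: the loop bound is init + n, so the trip count SCEV computes
  // carries the IV's start value.
  for (int64_t i = init; i < init + n; ++i)
    sum += values[i];

  // After: the new IV runs over [0, n) and the exit test compares against
  // the bare count n; the original index is rebuilt as i0 + init, mirroring
  // the %iv.offset add emitted at the top of the loop body.
  for (int64_t i0 = 0; i0 < n; ++i0)
    sum += values[i0 + init];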
diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-tripcount.ll b/llvm/test/Transforms/IndVarSimplify/simplify-tripcount.ll
--- a/llvm/test/Transforms/IndVarSimplify/simplify-tripcount.ll
+++ b/llvm/test/Transforms/IndVarSimplify/simplify-tripcount.ll
@@ -29,6 +29,7 @@
 ; CHECK: for.cond1.preheader.us.preheader:
 ; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]]
 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK: for.cond1.preheader.preheader1:
 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
@@ -37,64 +38,58 @@
 ; CHECK: for.cond1.preheader.us:
 ; CHECK-NEXT: [[INDVARS_IV39_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT40_US:%.*]], [[FOR_COND_CLEANUP3_LOOPEXIT_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT: [[INDVARS_IV_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_US:%.*]], [[FOR_COND_CLEANUP3_LOOPEXIT_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDVARS_IV_US]], [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDVARS_IV_US]], 1
-; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 [[TMP5]])
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDVARS_IV39_US]], [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[SMAX]], [[TMP6]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_US]] = add nsw i64 [[INDVARS_IV_US]], [[TMP0]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP7]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[INDVARS_IV_US]], [[N_VEC]]
-; CHECK-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[INDVARS_IV_US]], [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[VALUES:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP12]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[IDX:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], <vscale x 2 x i64> [[TMP14]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP15]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = fmul contract <vscale x 2 x double> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP17]] = fadd contract <vscale x 2 x double> [[VEC_PHI]], [[TMP16]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[INDEX]], [[INDVARS_IV_US]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[VALUES:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[IDX:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD3]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], <vscale x 2 x i64> [[TMP11]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP12]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <vscale x 2 x double> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP14]] = fadd contract <vscale x 2 x double> [[VEC_PHI]], [[TMP13]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP19:%.*]] = tail call contract double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP16:%.*]] = tail call contract double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> [[TMP14]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], label [[FOR_BODY4_US_PREHEADER]]
 ; CHECK: for.body4.us.preheader:
-; CHECK-NEXT: [[INDVARS_IV36_US_PH:%.*]] = phi i64 [ [[INDVARS_IV_US]], [[FOR_COND1_PREHEADER_US]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[TEMP_VALUE_030_US_PH:%.*]] = phi double [ 0.000000e+00, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEW_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TEMP_VALUE_030_US_PH:%.*]] = phi double [ 0.000000e+00, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]]
 ; CHECK: for.body4.us:
-; CHECK-NEXT: [[INDVARS_IV36_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT37_US:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV36_US_PH]], [[FOR_BODY4_US_PREHEADER]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEW:%.*]] = phi i64 [ [[INDVARS_IV_NEW_NEXT:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV_NEW_PH]], [[FOR_BODY4_US_PREHEADER]] ]
 ; CHECK-NEXT: [[TEMP_VALUE_030_US:%.*]] = phi double [ [[ADD9_US:%.*]], [[FOR_BODY4_US]] ], [ [[TEMP_VALUE_030_US_PH]], [[FOR_BODY4_US_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds double, ptr [[VALUES]], i64 [[INDVARS_IV36_US]]
-; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
-; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[IDX]], i64 [[INDVARS_IV36_US]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
-; CHECK-NEXT: [[IDXPROM7_US:%.*]] = sext i32 [[TMP21]] to i64
+; CHECK-NEXT: [[IV_OFFSET:%.*]] = add nsw i64 [[INDVARS_IV_NEW]], [[INDVARS_IV_US]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds double, ptr [[VALUES]], i64 [[IV_OFFSET]]
+; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[IDX]], i64 [[IV_OFFSET]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
+; CHECK-NEXT: [[IDXPROM7_US:%.*]] = sext i32 [[TMP18]] to i64
 ; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM7_US]]
-; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[ARRAYIDX8_US]], align 8
-; CHECK-NEXT: [[MUL_US:%.*]] = fmul contract double [[TMP20]], [[TMP22]]
+; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX8_US]], align 8
+; CHECK-NEXT: [[MUL_US:%.*]] = fmul contract double [[TMP17]], [[TMP19]]
 ; CHECK-NEXT: [[ADD9_US]] = fadd contract double [[TEMP_VALUE_030_US]], [[MUL_US]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT37_US]] = add nsw i64 [[INDVARS_IV36_US]], 1
-; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT37_US]], [[INDVARS_IV_NEXT_US]]
-; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[INDVARS_IV_NEW_NEXT]] = add nuw nsw i64 [[INDVARS_IV_NEW]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDVARS_IV_NEW_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: for.cond.cleanup3.loopexit.us:
-; CHECK-NEXT: [[ADD9_US_LCSSA:%.*]] = phi double [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ADD9_US]], [[FOR_BODY4_US]] ]
+; CHECK-NEXT: [[ADD9_US_LCSSA:%.*]] = phi double [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ [[ADD9_US]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT: [[ARRAYIDX12_US:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV39_US]]
 ; CHECK-NEXT: store double [[ADD9_US_LCSSA]], ptr [[ARRAYIDX12_US]], align 8
 ; CHECK-NEXT: [[INDVARS_IV_NEXT40_US]] = add nuw nsw i64 [[INDVARS_IV39_US]], 1
@@ -167,84 +162,81 @@
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NOUT]] to i64
 ; CHECK-NEXT: [[CMP227:%.*]] = icmp sgt i32 [[NIN]], 0
 ; CHECK-NEXT: br i1 [[CMP227]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER1:%.*]]
-; CHECK: for.cond1.preheader.us.preheader:
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK: for.cond1.preheader.preheader1:
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
-; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[B:%.*]], i8 0, i64 [[TMP3]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
+; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[B:%.*]], i8 0, i64 [[TMP1]], i1 false)
 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond1.preheader.us.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i64 0, [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP4]], [[TMP0]]
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
 ; CHECK: for.cond1.preheader.us:
 ; CHECK-NEXT: [[INDVARS_IV37_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT38_US:%.*]], [[FOR_COND_CLEANUP3_LOOPEXIT_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT: [[INDVARS_IV_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_US:%.*]], [[FOR_COND_CLEANUP3_LOOPEXIT_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[INDVARS_IV_US]], [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDVARS_IV_US]], -1
-; CHECK-NEXT: [[SMIN:%.*]] = tail call i64 @llvm.smin.i64(i64 [[TMP4]], i64 [[TMP5]])
-; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[INDVARS_IV_US]], [[SMIN]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i64 [[INDVARS_IV_US]], [[TMP0]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP2]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP8:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[INDVARS_IV_US]], [[N_VEC]]
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = sub nsw i64 0, [[N_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 1, [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 1
 ; CHECK-NEXT: [[TMP12:%.*]] = sub nsw i64 1, [[TMP11]]
 ; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = sub nsw i64 1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[INDVARS_IV_US]], [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[VALUES:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 [[TMP12]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP19]], align 8
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[INDVARS_IV_US]], [[INDEX]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[VALUES:%.*]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP9]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP17]], align 8
 ; CHECK-NEXT: [[REVERSE:%.*]] = tail call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[IDX:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP15]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i32>, ptr [[TMP21]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[IDX:%.*]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP12]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i32>, ptr [[TMP19]], align 4
 ; CHECK-NEXT: [[REVERSE4:%.*]] = tail call <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32> [[WIDE_LOAD3]])
-; CHECK-NEXT: [[TMP22:%.*]] = sext <vscale x 2 x i32> [[REVERSE4]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], <vscale x 2 x i64> [[TMP22]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP23]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
-; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <vscale x 2 x double> [[REVERSE]], [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP25]] = fadd contract <vscale x 2 x double> [[VEC_PHI]], [[TMP24]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = sext <vscale x 2 x i32> [[REVERSE4]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], <vscale x 2 x i64> [[TMP20]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP21]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> poison)
+; CHECK-NEXT: [[TMP22:%.*]] = fmul contract <vscale x 2 x double> [[REVERSE]], [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP23]] = fadd contract <vscale x 2 x double> [[VEC_PHI]], [[TMP22]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP27:%.*]] = tail call contract double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> [[TMP25]])
+; CHECK-NEXT: [[TMP25:%.*]] = tail call contract double @llvm.vector.reduce.fadd.nxv2f64(double -0.000000e+00, <vscale x 2 x double> [[TMP23]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], label [[FOR_BODY4_US_PREHEADER]]
 ; CHECK: for.body4.us.preheader:
-; CHECK-NEXT: [[INDVARS_IV34_US_PH:%.*]] = phi i64 [ [[INDVARS_IV_US]], [[FOR_COND1_PREHEADER_US]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[TEMP_VALUE_028_US_PH:%.*]] = phi double [ 0.000000e+00, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEW_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TEMP_VALUE_028_US_PH:%.*]] = phi double [ 0.000000e+00, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]]
 ; CHECK: for.body4.us:
-; CHECK-NEXT: [[INDVARS_IV34_US:%.*]] = phi i64 [ [[INDVARS_IV_NEXT35_US:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV34_US_PH]], [[FOR_BODY4_US_PREHEADER]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEW:%.*]] = phi i64 [ [[INDVARS_IV_NEW_NEXT:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV_NEW_PH]], [[FOR_BODY4_US_PREHEADER]] ]
 ; CHECK-NEXT: [[TEMP_VALUE_028_US:%.*]] = phi double [ [[ADD_US:%.*]], [[FOR_BODY4_US]] ], [ [[TEMP_VALUE_028_US_PH]], [[FOR_BODY4_US_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds double, ptr [[VALUES]], i64 [[INDVARS_IV34_US]]
-; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
-; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[IDX]], i64 [[INDVARS_IV34_US]]
-; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
-; CHECK-NEXT: [[IDXPROM7_US:%.*]] = sext i32 [[TMP29]] to i64
+; CHECK-NEXT: [[IV_OFFSET:%.*]] = add nsw i64 [[INDVARS_IV_NEW]], [[INDVARS_IV_US]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds double, ptr [[VALUES]], i64 [[IV_OFFSET]]
+; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[IDX]], i64 [[IV_OFFSET]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
+; CHECK-NEXT: [[IDXPROM7_US:%.*]] = sext i32 [[TMP27]] to i64
 ; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM7_US]]
-; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX8_US]], align 8
-; CHECK-NEXT: [[MUL_US:%.*]] = fmul contract double [[TMP28]], [[TMP30]]
+; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr [[ARRAYIDX8_US]], align 8
+; CHECK-NEXT: [[MUL_US:%.*]] = fmul contract double [[TMP26]], [[TMP28]]
 ; CHECK-NEXT: [[ADD_US]] = fadd contract double [[TEMP_VALUE_028_US]], [[MUL_US]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT35_US]] = add nsw i64 [[INDVARS_IV34_US]], -1
-; CHECK-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[INDVARS_IV_NEXT35_US]], [[TMP7]]
-; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[INDVARS_IV_NEW_NEXT]] = add nsw i64 [[INDVARS_IV_NEW]], -1
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDVARS_IV_NEW_NEXT]], [[TMP2]]
+; CHECK-NEXT: br i1 [[TMP29]], label [[FOR_COND_CLEANUP3_LOOPEXIT_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: for.cond.cleanup3.loopexit.us:
-; CHECK-NEXT: [[ADD_US_LCSSA:%.*]] = phi double [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[ADD_US]], [[FOR_BODY4_US]] ]
+; CHECK-NEXT: [[ADD_US_LCSSA:%.*]] = phi double [ [[TMP25]], [[MIDDLE_BLOCK]] ], [ [[ADD_US]], [[FOR_BODY4_US]] ]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_US]] = add nsw i64 [[INDVARS_IV_US]], [[TMP0]]
 ; CHECK-NEXT: [[ARRAYIDX11_US:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV37_US]]
 ; CHECK-NEXT: store double [[ADD_US_LCSSA]], ptr [[ARRAYIDX11_US]], align 8
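Usage note: the rewrite is on by default; for triage it can be disabled through the new hidden flag. The invocation below assumes the standard opt driver, and the input file name is illustrative:

  opt -passes=indvars -enable-simplify-trip-count=false -S input.ll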