Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -441,6 +441,9 @@ if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) return false; + LLVM_DEBUG(dbgs() << "Comparing SCEV info with IR pattern match\n"; + dbgs() << "SCEV: "; NumElements->dump(); + dbgs() << "IR: "; TripCount->dump(); ); if (TripCount != NumElements || !L->isLoopInvariant(BECount)) return false; @@ -492,7 +495,13 @@ } else return nullptr; - if (auto *RoundUp = dyn_cast(S->getLHS())) { + const SCEV *AddExpr; + if (auto *RoundUp = dyn_cast(S->getLHS())) + AddExpr = RoundUp->getOperand(0); + else + AddExpr = S->getLHS(); + + if (auto *RoundUp = dyn_cast(AddExpr)) { if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { if (Const->getAPInt() != (VF->getValue() - 1)) return nullptr; @@ -512,6 +521,10 @@ // Search for Elems in the following SCEV: // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF)) + + LLVM_DEBUG(dbgs() << "Searching for scalar trip count in:\n"; + TripCountSE->dump()); + const SCEV *Elems = nullptr; if (auto *TC = dyn_cast(TripCountSE)) if (auto *Div = dyn_cast(TC->getOperand(1))) @@ -532,9 +545,68 @@ SCEVExpander Expander(*SE, DL, "elements"); TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); - if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) + // If the expanded NumElements expression found in the SCEV epxresion is a + // value, then we can directly match this with the IR, the vector body and + // the masked load/store instruction. This is the case for single loops, or + // nested loops with loop iterators that are independent of each other. 
+ if (!dyn_cast(TCP.NumElements)) { + LLVM_DEBUG(dbgs() << "it's a value: done!\n"); + if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) + return false; + return true; + } + + // Otherwise, if the expanded NumElements expression is not a value, we are + // dealing with nested-loops with dependent loop iterators, and here we + // recognise only inner loops which receive their start/stop value from the + // outer loop. For such SCEVAddRecExpr, we have to work a lot harder to match + // the scalar trip count (TC). I.e., for an inner loop, SCEV will return a + // scalar evolution expression/statement with respect to its outer loop, and + // this outer-loop value is what we extract from the SCEV expression above. + // Since this does not correspond to the trip count of the inner loop, we + // traverse this def-use chain. I.e. the use of that outerloop value, will + // be used by the instruction that sets the trip count of the inner loop. + // Thus, for a nested SCEV expression, we look one loop-nest level up for a + // define. Then, we match this value with IR patterns used by the masked + // loads/stores to check if we indeed found the scalar loop trip count. This + // will be safe because if these values don't match, we bail and don't perform + // tail-predication. + LLVM_DEBUG(dbgs() << "ARM TP: Matching scalar TC: "; TCP.NumElements->dump()); + Value *IC = nullptr; + Loop *ParentLoop = L->getParentLoop(); + while (ParentLoop && !IC) { + for (auto *U : TCP.NumElements->users()) { + LLVM_DEBUG(dbgs() << "ARM TP: Analysing user: "; U->dump();); + // If the user is not loop invariant, something is happening in the loop + // that we don't understand. + if (!L->isLoopInvariant(U)) { + LLVM_DEBUG(dbgs() << "ARM TP: user not loop invariant\n"); + return false; + } + // A use can be used by a compare and branch, and this is fine, so just + // ignore compares. 
+ if (dyn_cast(U)) + continue; + + // While there can be several uses in the loop hierarchy, we expect the + // instruction that sets the trip count and is a user to be in the parent + // loop. + if (ParentLoop->contains(dyn_cast(U))) { + LLVM_DEBUG(dbgs() << "ARM TP: Set as scalar TC: "; U->dump()); + IC = U; + break; + } + } + ParentLoop = ParentLoop->getParentLoop(); + } + + if (!IC) return false; + LLVM_DEBUG(dbgs() << "ARM TP: FoundNumElements: "; IC->dump()); + TCP.NumElements = IC; + if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) + return false; return true; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll @@ -0,0 +1,564 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s + + +; This IR corresponds to a 2d loop, where the inner loop receives its stop +; value from the outer loop and thus depends on the outer loop: +; +; for (i = 0; i < N; i++) +; M = Size - i; +; for (j = 0; j < M; j++) +; // reduction +; +define dso_local void @SCEVAddRecExpr_2d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_2d( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[LSR_IV53:%.*]] = phi i32 [ 
[[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -4 +; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[CONV2]], [[I_037]] +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]] +; CHECK: vector.ph: +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP16]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi 
i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>* +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21]] = sub i32 [[TMP19]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]] +; CHECK-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4 +; CHECK-NEXT: [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]]) +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: 
[[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]]) +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP34]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]] +; CHECK: for.end17: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: ; preds = %entry + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: ; preds = %for.end, %for.body.lr.ph + %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ 
%lsr.iv48, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>* + %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> %induction, %23 + %25 = select <4 x i1> %24, 
<4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %27 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1 + %lsr.iv.next = add i32 %lsr.iv53, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + +; This is the almost the same as SCEVAddRecExpr_2d, except that a loop invariant +; statement has been added to vector.body. +; +define dso_local void @SCEVAddRecExpr_2d_not_invariant(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_2d_not_invariant( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 
[[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[CONV2]], [[I_037]] +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP8]], -1 +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP7]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[CONV2_PHI:%.*]] = phi i32 [ [[CONV2]], [[VECTOR_PH]] ], [ [[CONV2_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[CONV2_NEXT]] = add i32 [[CONV2_PHI]], 4 +; CHECK-NEXT: [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>* +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = 
shufflevector <4 x i32> [[TMP10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP16]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <4 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19]] = add <4 x i32> [[TMP18]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4 +; CHECK-NEXT: [[TMP20]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP9]], i32 1) +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[TMP19]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) +; CHECK-NEXT: br label 
[[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]] +; CHECK: for.end17: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: ; preds = %entry + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: ; preds = %for.end, %for.body.lr.ph + %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, 
%vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + +; Loop invariant statement added here: + + %conv2.phi = phi i32 [ %conv2, %vector.ph ], [ %conv2.next, %vector.body ] + %conv2.next = add i32 %conv2.phi, 4 + + %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>* + %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> 
%induction, %23 + %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %27 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1 + %lsr.iv.next = add i32 %lsr.iv53, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + + +; This IR corresponds to this 3d loop: +; +; for (k = 0; k < N; k++) +; for (i = 0; i < N; i++) +; M = Size - i; +; for (j = 0; j < M; j++) +; // reduction +; +; Inner loop j depends on its outerloop i, but not on its most outerloop k. +; +define dso_local void @SCEVAddRecExpr_3d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_3d( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP52:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP52]], label [[FOR_COND2_PREHEADER_LR_PH:%.*]], label [[FOR_END26:%.*]] +; CHECK: for.cond2.preheader.lr.ph: +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1645:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV7]], 3 +; CHECK-NEXT: br label [[FOR_COND2_PREHEADER_US:%.*]] +; CHECK: for.cond2.preheader.us: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US:%.*]] ], [ [[INPUT:%.*]], [[FOR_COND2_PREHEADER_LR_PH]] ] +; CHECK-NEXT: [[K_053_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_LR_PH]] ], [ [[INC25_US:%.*]], 
[[FOR_COND2_FOR_INC24_CRIT_EDGE_US]] ] +; CHECK-NEXT: br label [[FOR_BODY6_US:%.*]] +; CHECK: for.body6.us: +; CHECK-NEXT: [[LSR_IV72:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END_US:%.*]] ], [ [[TMP0]], [[FOR_COND2_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV67:%.*]] = phi i16* [ [[SCEVGEP68:%.*]], [[FOR_END_US]] ], [ [[INPUT]], [[FOR_COND2_PREHEADER_US]] ] +; CHECK-NEXT: [[I_050_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_US]] ], [ [[INC22_US:%.*]], [[FOR_END_US]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I_050_US]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV7]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -4 +; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[I_050_US]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[CONV7]], [[I_050_US]] +; CHECK-NEXT: [[CMP946_US:%.*]] = icmp slt i32 [[I_050_US]], [[CONV7]] +; CHECK-NEXT: br i1 [[CMP946_US]], label [[VECTOR_PH:%.*]], label [[FOR_END_US]] +; CHECK: vector.ph: +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP16]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV69:%.*]] = phi i16* [ [[SCEVGEP70:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV67]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV64:%.*]] = phi i16* [ [[SCEVGEP65:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x 
i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LSR_IV6971:%.*]] = bitcast i16* [[LSR_IV69]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV6466:%.*]] = bitcast i16* [[LSR_IV64]] to <4 x i16>* +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21]] = sub i32 [[TMP19]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6466]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD59:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6971]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD59]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1645]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]] +; CHECK-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP65]] = getelementptr i16, i16* [[LSR_IV64]], i32 4 +; CHECK-NEXT: [[SCEVGEP70]] = getelementptr i16, i16* [[LSR_IV69]], i32 4 +; CHECK-NEXT: [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]]) +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]]) +; CHECK-NEXT: br label [[FOR_END_US]] +; CHECK: for.end.us: +; CHECK-NEXT: [[SUM_0_LCSSA_US:%.*]] = phi i32 [ 0, [[FOR_BODY6_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA_US]], 16 +; CHECK-NEXT: [[CONV19_US:%.*]] = trunc i32 [[TMP34]] to i16 +; CHECK-NEXT: [[ARRAYIDX20_US:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_050_US]] +; CHECK-NEXT: store i16 [[CONV19_US]], i16* [[ARRAYIDX20_US]], align 2 +; CHECK-NEXT: [[INC22_US]] = add nuw nsw i32 [[I_050_US]], 1 +; CHECK-NEXT: [[SCEVGEP68]] = getelementptr i16, i16* [[LSR_IV67]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV72]], -1 +; CHECK-NEXT: [[EXITCOND55:%.*]] = icmp eq i32 [[INC22_US]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND55]], label [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]], label [[FOR_BODY6_US]] +; CHECK: for.cond2.for.inc24_crit_edge.us: +; CHECK-NEXT: [[INC25_US]] = add nuw nsw i32 [[K_053_US]], 1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 1 +; CHECK-NEXT: [[EXITCOND56:%.*]] = icmp eq i32 [[INC25_US]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND56]], label [[FOR_END26]], label [[FOR_COND2_PREHEADER_US]] +; CHECK: for.end26: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp52 = icmp sgt i16 %N, 0 + br i1 %cmp52, label %for.cond2.preheader.lr.ph, label %for.end26 + +for.cond2.preheader.lr.ph: ; preds = %entry + %conv7 = sext i16 %Size to i32 + %conv1645 = zext i16 %Scale to i32 + %0 = add i32 %conv7, 3 + br label %for.cond2.preheader.us + +for.cond2.preheader.us: ; preds = %for.cond2.for.inc24_crit_edge.us, %for.cond2.preheader.lr.ph + %lsr.iv = phi i16* [ %scevgep, 
%for.cond2.for.inc24_crit_edge.us ], [ %Input, %for.cond2.preheader.lr.ph ] + %k.053.us = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %inc25.us, %for.cond2.for.inc24_crit_edge.us ] + br label %for.body6.us + +for.body6.us: ; preds = %for.end.us, %for.cond2.preheader.us + %lsr.iv72 = phi i32 [ %lsr.iv.next, %for.end.us ], [ %0, %for.cond2.preheader.us ] + %lsr.iv67 = phi i16* [ %scevgep68, %for.end.us ], [ %Input, %for.cond2.preheader.us ] + %i.050.us = phi i32 [ 0, %for.cond2.preheader.us ], [ %inc22.us, %for.end.us ] + %1 = mul nsw i32 %i.050.us, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv7, %i.050.us + %cmp946.us = icmp slt i32 %i.050.us, %conv7 + br i1 %cmp946.us, label %vector.ph, label %for.end.us + +vector.ph: ; preds = %for.body6.us + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv69 = phi i16* [ %scevgep70, %vector.body ], [ %lsr.iv67, %vector.ph ] + %lsr.iv64 = phi i16* [ %scevgep65, %vector.body ], [ %lsr.iv, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + %lsr.iv6971 = bitcast i16* %lsr.iv69 to <4 x i16>* + %lsr.iv6466 = bitcast i16* %lsr.iv64 to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> 
@llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6466, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load59 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6971, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load59 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement <4 x i32> undef, i32 %conv1645, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep65 = getelementptr i16, i16* %lsr.iv64, i32 4 + %scevgep70 = getelementptr i16, i16* %lsr.iv69, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> %induction, %23 + %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end.us + +for.end.us: ; preds = %middle.block, %for.body6.us + %Sum.0.lcssa.us = phi i32 [ 0, %for.body6.us ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa.us, 16 + %conv19.us = trunc i32 %27 to i16 + %arrayidx20.us = getelementptr inbounds i16, i16* %Output, i32 %i.050.us + store i16 %conv19.us, i16* %arrayidx20.us, align 2 + %inc22.us = add nuw nsw i32 %i.050.us, 1 + %scevgep68 = getelementptr i16, i16* %lsr.iv67, i32 1 + %lsr.iv.next = add i32 %lsr.iv72, -1 + %exitcond55 = icmp eq i32 %inc22.us, %conv + br i1 %exitcond55, label %for.cond2.for.inc24_crit_edge.us, label %for.body6.us + +for.cond2.for.inc24_crit_edge.us: ; preds = %for.end.us + %inc25.us = add nuw nsw i32 %k.053.us, 1 + %scevgep = getelementptr i16, 
i16* %lsr.iv, i32 1 + %exitcond56 = icmp eq i32 %inc25.us, %conv + br i1 %exitcond56, label %for.end26, label %for.cond2.preheader.us + +for.end26: ; preds = %for.cond2.for.inc24_crit_edge.us, %entry + ret void +} + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>) +declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)