Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -441,6 +441,9 @@ if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) return false; + LLVM_DEBUG(dbgs() << "Comparing SCEV info with IR pattern match\n"; + dbgs() << "SCEV: "; NumElements->dump(); + dbgs() << "IR: "; TripCount->dump(); ); if (TripCount != NumElements || !L->isLoopInvariant(BECount)) return false; @@ -492,7 +495,13 @@ } else return nullptr; - if (auto *RoundUp = dyn_cast(S->getLHS())) { + const SCEV *AddExpr; + if (auto *RoundUp = dyn_cast(S->getLHS())) + AddExpr = RoundUp->getOperand(0); + else + AddExpr = S->getLHS(); + + if (auto *RoundUp = dyn_cast(AddExpr)) { if (auto *Const = dyn_cast(RoundUp->getOperand(0))) { if (Const->getAPInt() != (VF->getValue() - 1)) return nullptr; @@ -512,6 +521,10 @@ // Search for Elems in the following SCEV: // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))) /u VF)) + + LLVM_DEBUG(dbgs() << "Searching for scalar trip count in:\n"; + TripCountSE->dump()); + const SCEV *Elems = nullptr; if (auto *TC = dyn_cast(TripCountSE)) if (auto *Div = dyn_cast(TC->getOperand(1))) @@ -532,9 +545,100 @@ SCEVExpander Expander(*SE, DL, "elements"); TCP.NumElements = Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); - if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) + // After expansion NumElements can be an instruction or a value. It is the + // start from where we start the pattern matching and the traversal of its + // uses, to see where this value is used to define a value that can + // correspond to a statement that calculates the iteration count. After + // finding it, we cross check and match this with the IR, i.e. the vector + // body and the masked load/store instruction, to see that these masked/loads + // stores indeed use the iteration count. 
+ // + // Here's an example, where inner loop j has an upper bound calculated + // by S - i and i is the outer loop iterator: + // + // void foo (..., int N, int M, int S) { + // for (i = 0; i < N; i++) { + // M = S - i; + // for (j = 0; j < M; j++) { + // + // And now the SCEV expression looks like this: + // + // (1 + ((-4 + (4 * ({(3 + %S),+,-1} /u 4))) /u 4)) + // + // The challenge here is that we have extracted %S as NumElements from the + // SCEV expression, which is a scAddRecExpr type, but this does not yet + // correspond to the iteration count of the loop. In simpler cases, when we + // have a simpler scAddExpr, the NumElements directly corresponds to the + // iteration count. To cover this more complicated case, we traverse the uses + // of %S, until we find a use that is: + // 1) loop-invariant, + // 2) not a compare (compares are not used to define a value), + // 3) contained in a parent loop (we want to find the last use). + // + // Here's a heavily reduced example that corresponds to the outer loop + // preheader and body blocks of the pseudo-code example above: + // + // outer.for.body.lr.ph: + // [[CONV2]] = sext i16 [[S]] to i32 + // [[TMP0]] = add i32 [[CONV2]], 3 + // br label [[FOR_BODY]] + // outer.for.body: + // [[TMP2]] = add i32 [[CONV2]], [[TMP1]] + // [[TMP17]] = sub i32 [[CONV2]], [[I_037]] + // [[CMP433]] = icmp slt i32 [[I_037]], [[CONV2]] + // br i1 [[CMP433]], label [[VECTOR_PH]], label [[FOR_END]] + // + // Variable [[S]] corresponds to %S in the SCEV expression, and is used to + // define CONV2. Starting in the parent loop, outer.for.body, we look for + // uses of CONV2. Ignoring icmps, we find it is used to define [[TMP17]], + // which calculates [[CONV2]] - [[I_037]], the iteration count, and + // corresponds to M = S - i from the example. 
+ + LLVM_DEBUG(dbgs() << "ARM TP: Matching scalar TC: "; TCP.NumElements->dump()); + Value *ScalarTC = nullptr; + Loop *ParentLoop = L->getParentLoop(); + while (ParentLoop && !ScalarTC) { + for (auto *U : TCP.NumElements->users()) { + LLVM_DEBUG(dbgs() << "ARM TP: Analysing user: "; U->dump();); + // 1) If the user is not loop invariant, something is happening in the loop + // that we don't understand. + if (!L->isLoopInvariant(U)) { + LLVM_DEBUG(dbgs() << "ARM TP: user not loop invariant\n"); + return false; + } + + // 2) A use can be used by a compare and branch, and this is fine, so just + // ignore compares. + if (dyn_cast<ICmpInst>(U)) + continue; + + // 3) While there can be several uses in the loop hierarchy, we expect the + // instruction that sets the trip count and is a user to be in the parent + // loop. + if (ParentLoop->contains(dyn_cast<Instruction>(U))) { + LLVM_DEBUG(dbgs() << "ARM TP: Set as scalar TC: "; U->dump()); + ScalarTC = U; + break; + } + } + ParentLoop = ParentLoop->getParentLoop(); + } + + // Now we choose NumElements. This depends on whether the search for a definition of the + // trip count was successful. If not, and if it is an instruction (not a + // value), we bail and can't handle this case. But if a ScalarTC is found, we + // will use that. + if (!ScalarTC && dyn_cast<Instruction>(TCP.NumElements)) return false; + else if (ScalarTC) + TCP.NumElements = ScalarTC; + // Else, if we haven't found ScalarTC, we use NumElements as it was, just + // as it was expanded from Elems. 
+ LLVM_DEBUG(dbgs() << "ARM TP: Found NumElements: "; TCP.NumElements->dump()); + + if (!MatchElemCountLoopSetup(L, TCP.Shuffle, TCP.NumElements)) + return false; return true; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested-loop.ll @@ -0,0 +1,728 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s + +; This IR corresponds to a 2d loop, where the inner loop upper bound is +; determined by the outer loop: +; +; for (i = 0; i < N; i++) +; M = Size - i; +; for (j = 0; j < M; j++) +; // reduction +; +; This is results in SCEVAddRecExpr expression type. The value in this SCEV expression +; does not match the scalar trip count, and requires traversal of the use-def chain. +; That is, we start at [[CONV]], to find that [[TMP17]] sets the iteration count +; of the inner loop. 
+; +define dso_local void @SCEVAddRecExpr_2d_i16(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_2d_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -4 +; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[CONV2]], [[I_037]] +; CHECK-NEXT: [[CMP433:%.*]] = 
icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]] +; CHECK: vector.ph: +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP16]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>* +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21]] = sub i32 [[TMP19]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]] +; CHECK-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] 
= getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4 +; CHECK-NEXT: [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]]) +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]]) +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP34]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]] +; CHECK: for.end17: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: ; preds = %entry + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: ; preds = %for.end, %for.body.lr.ph + %lsr.iv53 = phi i32 
[ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>* + %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement 
<4 x i32> undef, i32 %conv1032, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> %induction, %23 + %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %27 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1 + %lsr.iv.next = add i32 %lsr.iv53, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + +; Slightly different case than @SCEVAddRecExpr_2d_i16, where there is a sext using +; and defining the scalar in the entry block from where we start searching. This +; is absent here, and so our def-use chain traversal is slightly different. 
+; +define dso_local void @SCEVAddRecExpr_2d_i32(i32* nocapture readonly %Input, i32* nocapture %Output, i32 %Size, i32 %N, i32 %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_2d_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP29:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP29]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END11:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE:%.*]], 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[LSR_IV46:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ] +; CHECK-NEXT: [[LSR_IV41:%.*]] = phi i32* [ [[INPUT:%.*]], [[FOR_BODY_PREHEADER]] ], [ [[SCEVGEP42:%.*]], [[FOR_END]] ] +; CHECK-NEXT: [[I_030:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I_030]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SIZE]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], -4 +; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[I_030]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[SIZE]], [[I_030]] +; CHECK-NEXT: [[CMP226:%.*]] = icmp slt i32 [[I_030]], [[SIZE]] +; CHECK-NEXT: br i1 [[CMP226]], label [[VECTOR_PH:%.*]], label [[FOR_END]] +; CHECK: vector.ph: +; CHECK-NEXT: call void 
@llvm.set.loop.iterations.i32(i32 [[TMP16]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV43:%.*]] = phi i32* [ [[SCEVGEP44:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV41]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LSR_IV4345:%.*]] = bitcast i32* [[LSR_IV43]] to <4 x i32>* +; CHECK-NEXT: [[LSR_IV40:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>* +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21]] = sub i32 [[TMP19]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV40]], i32 4, <4 x i1> [[TMP20]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV4345]], i32 4, <4 x i1> [[TMP20]], <4 x i32> undef) +; CHECK-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD35]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> undef, i32 [[SCALE:%.*]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <4 x i32> [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[TMP25]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP44]] = getelementptr i32, i32* [[LSR_IV43]], i32 4 +; CHECK-NEXT: [[TMP27]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: br i1 
[[TMP28]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP26]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP29:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]]) +; CHECK-NEXT: [[TMP30:%.*]] = select <4 x i1> [[TMP29]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[CONV7:%.*]] = ashr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[OUTPUT:%.*]], i32 [[I_030]] +; CHECK-NEXT: store i32 [[CONV7]], i32* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[INC10]] = add nuw nsw i32 [[I_030]], 1 +; CHECK-NEXT: [[SCEVGEP42]] = getelementptr i32, i32* [[LSR_IV41]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV46]], -1 +; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[INC10]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END11]], label [[FOR_BODY]] +; CHECK: for.end11: +; CHECK-NEXT: ret void +; +entry: + %cmp29 = icmp sgt i32 %N, 0 + br i1 %cmp29, label %for.body.preheader, label %for.end11 + +for.body.preheader: ; preds = %entry + %0 = add i32 %Size, 3 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.end + %lsr.iv46 = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.end ] + %lsr.iv41 = phi i32* [ %Input, %for.body.preheader ], [ %scevgep42, %for.end ] + %i.030 = phi i32 [ %inc10, %for.end ], [ 0, %for.body.preheader ] + %1 = mul nsw i32 %i.030, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %Size, %i.030 + %cmp226 = icmp slt i32 %i.030, 
%Size + br i1 %cmp226, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv43 = phi i32* [ %scevgep44, %vector.body ], [ %lsr.iv41, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %17, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %18, %vector.body ] + %lsr.iv4345 = bitcast i32* %lsr.iv43 to <4 x i32>* + %lsr.iv40 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv40, i32 4, <4 x i1> %12, <4 x i32> undef) + %wide.masked.load35 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv4345, i32 4, <4 x i1> %12, <4 x i32> undef) + %13 = mul nsw <4 x i32> %wide.masked.load35, %wide.masked.load + %14 = insertelement <4 x i32> undef, i32 %Scale, i32 0 + %15 = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> zeroinitializer + %16 = ashr <4 x i32> %13, %15 + %17 = add <4 x i32> %16, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep44 = getelementptr i32, i32* %lsr.iv43, i32 4 + %18 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %19 = icmp ne i32 %18, 0 + br i1 %19, label %vector.body, label %middle.block + +middle.block: ; preds = 
%vector.body + %20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %21 = shufflevector <4 x i32> %20, <4 x i32> undef, <4 x i32> zeroinitializer + %22 = icmp ule <4 x i32> %induction, %21 + %23 = select <4 x i1> %22, <4 x i32> %17, <4 x i32> %vec.phi + %24 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %23) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %24, %middle.block ] + %conv7 = ashr i32 %Sum.0.lcssa, 16 + %arrayidx8 = getelementptr inbounds i32, i32* %Output, i32 %i.030 + store i32 %conv7, i32* %arrayidx8, align 4 + %inc10 = add nuw nsw i32 %i.030, 1 + %scevgep42 = getelementptr i32, i32* %lsr.iv41, i32 1 + %lsr.iv.next = add i32 %lsr.iv46, -1 + %exitcond32 = icmp eq i32 %inc10, %N + br i1 %exitcond32, label %for.end11, label %for.body + +for.end11: ; preds = %for.end, %entry + ret void +} + +; This is almost the same as SCEVAddRecExpr_2d_i16, except that a non-loop-invariant +; statement has been added to vector.body, so we can't tail-predicate this loop. 
+; +define dso_local void @SCEVAddRecExpr_2d_not_invariant(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_2d_not_invariant( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP36:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP36]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END17:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1032:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV2]], 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[LSR_IV53:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[LSR_IV48:%.*]] = phi i16* [ [[SCEVGEP49:%.*]], [[FOR_END]] ], [ [[INPUT:%.*]], [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[I_037:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[I_037]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i32 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[CONV2]], [[I_037]] +; CHECK-NEXT: [[CMP433:%.*]] = icmp slt i32 [[I_037]], [[CONV2]] +; CHECK-NEXT: br i1 [[CMP433]], label [[VECTOR_PH:%.*]], label [[FOR_END]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP8]], -1 +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP7]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV50:%.*]] = phi i16* [ [[SCEVGEP51:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV48]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] 
= phi i16* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[INPUT]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[CONV2_PHI:%.*]] = phi i32 [ [[CONV2]], [[VECTOR_PH]] ], [ [[CONV2_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[CONV2_NEXT]] = add i32 [[CONV2_PHI]], 4 +; CHECK-NEXT: [[LSR_IV5052:%.*]] = bitcast i16* [[LSR_IV50]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV47:%.*]] = bitcast i16* [[LSR_IV]] to <4 x i16>* +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP11]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV47]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV5052]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> 
[[TMP16]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <4 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19]] = add <4 x i32> [[TMP18]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK-NEXT: [[SCEVGEP51]] = getelementptr i16, i16* [[LSR_IV50]], i32 4 +; CHECK-NEXT: [[TMP20]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP9]], i32 1) +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ule <4 x i32> [[INDUCTION]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[TMP19]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[SUM_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_037]] +; CHECK-NEXT: store i16 [[CONV13]], i16* [[ARRAYIDX14]], align 2 +; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[I_037]], 1 +; CHECK-NEXT: [[SCEVGEP49]] = getelementptr i16, i16* [[LSR_IV48]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV53]], -1 +; CHECK-NEXT: [[EXITCOND39:%.*]] = icmp eq i32 [[INC16]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND39]], label [[FOR_END17]], label [[FOR_BODY]] +; CHECK: for.end17: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp36 = icmp 
sgt i16 %N, 0 + br i1 %cmp36, label %for.body.lr.ph, label %for.end17 + +for.body.lr.ph: ; preds = %entry + %conv2 = sext i16 %Size to i32 + %conv1032 = zext i16 %Scale to i32 + %0 = add i32 %conv2, 3 + br label %for.body + +for.body: ; preds = %for.end, %for.body.lr.ph + %lsr.iv53 = phi i32 [ %lsr.iv.next, %for.end ], [ %0, %for.body.lr.ph ] + %lsr.iv48 = phi i16* [ %scevgep49, %for.end ], [ %Input, %for.body.lr.ph ] + %i.037 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ] + %1 = mul nsw i32 %i.037, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv2, %i.037 + %cmp433 = icmp slt i32 %i.037, %conv2 + br i1 %cmp433, label %vector.ph, label %for.end + +vector.ph: ; preds = %for.body + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv50 = phi i16* [ %scevgep51, %vector.body ], [ %lsr.iv48, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %Input, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + +; Loop invariant statement added here: + + %conv2.phi = phi i32 [ %conv2, %vector.ph ], [ %conv2.next, %vector.body ] + %conv2.next = add i32 %conv2.phi, 4 + + %lsr.iv5052 = bitcast i16* %lsr.iv50 to <4 x i16>* + %lsr.iv47 = bitcast i16* %lsr.iv to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule 
<4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv47, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv5052, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load42 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement <4 x i32> undef, i32 %conv1032, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep51 = getelementptr i16, i16* %lsr.iv50, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> %induction, %23 + %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end + +for.end: ; preds = %middle.block, %for.body + %Sum.0.lcssa = phi i32 [ 0, %for.body ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa, 16 + %conv13 = trunc i32 %27 to i16 + %arrayidx14 = getelementptr inbounds i16, i16* %Output, i32 %i.037 + store i16 %conv13, i16* %arrayidx14, align 2 + %inc16 = add nuw nsw i32 %i.037, 1 + %scevgep49 = getelementptr i16, i16* %lsr.iv48, i32 1 + %lsr.iv.next = add i32 %lsr.iv53, -1 + %exitcond39 = icmp eq i32 %inc16, %conv + br i1 %exitcond39, label %for.end17, label %for.body + +for.end17: ; preds = %for.end, %entry + ret void +} + +; This IR corresponds to this 3d loop: +; +; for (k = 0; k < N; k++) +; for (i = 0; i < N; 
i++) +; M = Size - i; +; for (j = 0; j < M; j++) +; // reduction +; +; Inner loop j depends on its outer loop i, but not on its outermost loop k. +; Thus, the SCEV expression is also a SCEVAddRecExpr, and we should +; tail-predicate this. +; +define dso_local void @SCEVAddRecExpr_3d(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { +; CHECK-LABEL: @SCEVAddRecExpr_3d( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP52:%.*]] = icmp sgt i16 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP52]], label [[FOR_COND2_PREHEADER_LR_PH:%.*]], label [[FOR_END26:%.*]] +; CHECK: for.cond2.preheader.lr.ph: +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[CONV1645:%.*]] = zext i16 [[SCALE:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CONV7]], 3 +; CHECK-NEXT: br label [[FOR_COND2_PREHEADER_US:%.*]] +; CHECK: for.cond2.preheader.us: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i16* [ [[SCEVGEP:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US:%.*]] ], [ [[INPUT:%.*]], [[FOR_COND2_PREHEADER_LR_PH]] ] +; CHECK-NEXT: [[K_053_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_LR_PH]] ], [ [[INC25_US:%.*]], [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]] ] +; CHECK-NEXT: br label [[FOR_BODY6_US:%.*]] +; CHECK: for.body6.us: +; CHECK-NEXT: [[LSR_IV72:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_END_US:%.*]] ], [ [[TMP0]], [[FOR_COND2_PREHEADER_US]] ] +; CHECK-NEXT: [[LSR_IV67:%.*]] = phi i16* [ [[SCEVGEP68:%.*]], [[FOR_END_US]] ], [ [[INPUT]], [[FOR_COND2_PREHEADER_US]] ] +; CHECK-NEXT: [[I_050_US:%.*]] = phi i32 [ 0, [[FOR_COND2_PREHEADER_US]] ], [ [[INC22_US:%.*]], [[FOR_END_US]] ] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I_050_US]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[CONV7]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 2 +; CHECK-NEXT: 
[[TMP6:%.*]] = add i32 [[TMP5]], -4 +; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[I_050_US]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP0]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -4 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[CONV7]], [[I_050_US]] +; CHECK-NEXT: [[CMP946_US:%.*]] = icmp slt i32 [[I_050_US]], [[CONV7]] +; CHECK-NEXT: br i1 [[CMP946_US]], label [[VECTOR_PH:%.*]], label [[FOR_END_US]] +; CHECK: vector.ph: +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP16]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[LSR_IV69:%.*]] = phi i16* [ [[SCEVGEP70:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV67]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[LSR_IV64:%.*]] = phi i16* [ [[SCEVGEP65:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP16]], [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ [[TMP17]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[LSR_IV6971:%.*]] = bitcast i16* [[LSR_IV69]] to <4 x i16>* +; CHECK-NEXT: [[LSR_IV6466:%.*]] = bitcast i16* [[LSR_IV64]] to <4 x i16>* +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21]] = sub i32 [[TMP19]], 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6466]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x 
i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[WIDE_MASKED_LOAD59:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[LSR_IV6971]], i32 2, <4 x i1> [[TMP20]], <4 x i16> undef) +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD59]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1645]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = ashr <4 x i32> [[TMP24]], [[TMP26]] +; CHECK-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[SCEVGEP65]] = getelementptr i16, i16* [[LSR_IV64]], i32 4 +; CHECK-NEXT: [[SCEVGEP70]] = getelementptr i16, i16* [[LSR_IV69]], i32 4 +; CHECK-NEXT: [[TMP29]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP18]], i32 1) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[VEC_PHI_LCSSA:%.*]] = phi <4 x i32> [ [[VEC_PHI]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP28]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP31:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP9]]) +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP31]], <4 x i32> [[DOTLCSSA]], <4 x i32> [[VEC_PHI_LCSSA]] +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP32]]) +; CHECK-NEXT: br label [[FOR_END_US]] +; CHECK: for.end.us: +; CHECK-NEXT: [[SUM_0_LCSSA_US:%.*]] = phi i32 [ 0, [[FOR_BODY6_US]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = lshr i32 [[SUM_0_LCSSA_US]], 16 +; CHECK-NEXT: [[CONV19_US:%.*]] = trunc i32 [[TMP34]] to i16 +; CHECK-NEXT: [[ARRAYIDX20_US:%.*]] = getelementptr inbounds i16, i16* [[OUTPUT:%.*]], i32 [[I_050_US]] +; CHECK-NEXT: store i16 [[CONV19_US]], i16* 
[[ARRAYIDX20_US]], align 2 +; CHECK-NEXT: [[INC22_US]] = add nuw nsw i32 [[I_050_US]], 1 +; CHECK-NEXT: [[SCEVGEP68]] = getelementptr i16, i16* [[LSR_IV67]], i32 1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV72]], -1 +; CHECK-NEXT: [[EXITCOND55:%.*]] = icmp eq i32 [[INC22_US]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND55]], label [[FOR_COND2_FOR_INC24_CRIT_EDGE_US]], label [[FOR_BODY6_US]] +; CHECK: for.cond2.for.inc24_crit_edge.us: +; CHECK-NEXT: [[INC25_US]] = add nuw nsw i32 [[K_053_US]], 1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i16, i16* [[LSR_IV]], i32 1 +; CHECK-NEXT: [[EXITCOND56:%.*]] = icmp eq i32 [[INC25_US]], [[CONV]] +; CHECK-NEXT: br i1 [[EXITCOND56]], label [[FOR_END26]], label [[FOR_COND2_PREHEADER_US]] +; CHECK: for.end26: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i16 %N to i32 + %cmp52 = icmp sgt i16 %N, 0 + br i1 %cmp52, label %for.cond2.preheader.lr.ph, label %for.end26 + +for.cond2.preheader.lr.ph: ; preds = %entry + %conv7 = sext i16 %Size to i32 + %conv1645 = zext i16 %Scale to i32 + %0 = add i32 %conv7, 3 + br label %for.cond2.preheader.us + +for.cond2.preheader.us: ; preds = %for.cond2.for.inc24_crit_edge.us, %for.cond2.preheader.lr.ph + %lsr.iv = phi i16* [ %scevgep, %for.cond2.for.inc24_crit_edge.us ], [ %Input, %for.cond2.preheader.lr.ph ] + %k.053.us = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %inc25.us, %for.cond2.for.inc24_crit_edge.us ] + br label %for.body6.us + +for.body6.us: ; preds = %for.end.us, %for.cond2.preheader.us + %lsr.iv72 = phi i32 [ %lsr.iv.next, %for.end.us ], [ %0, %for.cond2.preheader.us ] + %lsr.iv67 = phi i16* [ %scevgep68, %for.end.us ], [ %Input, %for.cond2.preheader.us ] + %i.050.us = phi i32 [ 0, %for.cond2.preheader.us ], [ %inc22.us, %for.end.us ] + %1 = mul nsw i32 %i.050.us, -1 + %2 = add i32 %0, %1 + %3 = lshr i32 %2, 2 + %4 = shl nuw i32 %3, 2 + %5 = add i32 %4, -4 + %6 = lshr i32 %5, 2 + %7 = add nuw nsw i32 %6, 1 + %8 = sub i32 %conv7, %i.050.us + %cmp946.us = icmp slt i32 %i.050.us, 
%conv7 + br i1 %cmp946.us, label %vector.ph, label %for.end.us + +vector.ph: ; preds = %for.body6.us + %trip.count.minus.1 = add i32 %8, -1 + call void @llvm.set.loop.iterations.i32(i32 %7) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv69 = phi i16* [ %scevgep70, %vector.body ], [ %lsr.iv67, %vector.ph ] + %lsr.iv64 = phi i16* [ %scevgep65, %vector.body ], [ %lsr.iv, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %19, %vector.body ] + %9 = phi i32 [ %7, %vector.ph ], [ %20, %vector.body ] + %lsr.iv6971 = bitcast i16* %lsr.iv69 to <4 x i16>* + %lsr.iv6466 = bitcast i16* %lsr.iv64 to <4 x i16>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = or <4 x i32> %broadcast.splat, + %10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp ule <4 x i32> %induction, %11 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6466, i32 2, <4 x i1> %12, <4 x i16> undef) + %13 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load59 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv6971, i32 2, <4 x i1> %12, <4 x i16> undef) + %14 = sext <4 x i16> %wide.masked.load59 to <4 x i32> + %15 = mul nsw <4 x i32> %14, %13 + %16 = insertelement <4 x i32> undef, i32 %conv1645, i32 0 + %17 = shufflevector <4 x i32> %16, <4 x i32> undef, <4 x i32> zeroinitializer + %18 = ashr <4 x i32> %15, %17 + %19 = add <4 x i32> %18, %vec.phi + %index.next = add i32 %index, 4 + %scevgep65 = getelementptr i16, i16* %lsr.iv64, i32 4 + %scevgep70 = getelementptr i16, i16* %lsr.iv69, i32 4 + %20 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %9, i32 1) + %21 
= icmp ne i32 %20, 0 + br i1 %21, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %22 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %23 = shufflevector <4 x i32> %22, <4 x i32> undef, <4 x i32> zeroinitializer + %24 = icmp ule <4 x i32> %induction, %23 + %25 = select <4 x i1> %24, <4 x i32> %19, <4 x i32> %vec.phi + %26 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %25) + br label %for.end.us + +for.end.us: ; preds = %middle.block, %for.body6.us + %Sum.0.lcssa.us = phi i32 [ 0, %for.body6.us ], [ %26, %middle.block ] + %27 = lshr i32 %Sum.0.lcssa.us, 16 + %conv19.us = trunc i32 %27 to i16 + %arrayidx20.us = getelementptr inbounds i16, i16* %Output, i32 %i.050.us + store i16 %conv19.us, i16* %arrayidx20.us, align 2 + %inc22.us = add nuw nsw i32 %i.050.us, 1 + %scevgep68 = getelementptr i16, i16* %lsr.iv67, i32 1 + %lsr.iv.next = add i32 %lsr.iv72, -1 + %exitcond55 = icmp eq i32 %inc22.us, %conv + br i1 %exitcond55, label %for.cond2.for.inc24_crit_edge.us, label %for.body6.us + +for.cond2.for.inc24_crit_edge.us: ; preds = %for.end.us + %inc25.us = add nuw nsw i32 %k.053.us, 1 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 1 + %exitcond56 = icmp eq i32 %inc25.us, %conv + br i1 %exitcond56, label %for.end26, label %for.cond2.preheader.us + +for.end26: ; preds = %for.cond2.for.inc24_crit_edge.us, %entry + ret void +} + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>) +declare <2 x i64> 
@llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) +declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)