Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2561,7 +2561,6 @@
   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
     for (unsigned Part = 0; Part < UF; ++Part) {
-      assert(!VF.isScalable() && "scalable vectors not yet supported.");
       Value *StartIdx;
       if (Step->getType()->isFloatingPointTy())
         StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
@@ -3298,10 +3297,9 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
-    assert(!VF.isScalable() &&
-           "Tail folding not yet supported for scalable vectors");
+    Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
     TC = Builder.CreateAdd(
-        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
+        TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
   }
 
   // Now we need to generate the expression for the part of the loop that the
@@ -5755,10 +5753,12 @@
     }
   }
 
-  // For scalable vectors, don't use tail folding as this is currently not yet
-  // supported. The code is likely to have ended up here if the tripcount is
-  // low, in which case it makes sense not to use scalable vectors.
-  if (MaxFactors.ScalableVF.isVector())
+  // For scalable vectors, don't use tail folding for low trip counts or when
+  // optimizing for code size, unless the user has explicitly requested it.
+  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
+      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
+      MaxFactors.ScalableVF.isVector())
     MaxFactors.ScalableVF = ElementCount::getScalable(0);
 
   // If we don't know the precise trip count, or if the trip count that we
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -loop-vectorize -scalable-vectorization=preferred < %s | FileCheck %s
+
+; These tests ensure that tail-folding is enabled when the
+; llvm.loop.vectorize.predicate.enable loop attribute is set to true.
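+;
+; For reference, a rough sketch of the rounded-up trip count the vectorizer
+; now emits for scalable VFs (illustrative names, not the exact temporaries):
+;   %vscale   = call i64 @llvm.vscale.i64()
+;   %lanes    = mul i64 %vscale, 4        ; runtime VF * UF
+;   %adjust   = sub i64 %lanes, 1
+;   %n.rnd.up = add i64 %tc, %adjust
+;   %n.mod.vf = urem i64 %n.rnd.up, %lanes
+;   %n.vec    = sub i64 %n.rnd.up, %n.mod.vf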
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+; CHECK-LABEL: @simple_memset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, i32* %ptr, i64 %index
+  store i32 %val, i32* %gep
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -1,23 +1,555 @@
 ; RUN: opt -S -loop-vectorize -scalable-vectorization=preferred -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s
-
-; CHECK-NOT: vector.body:
+; RUN: opt -S -loop-vectorize -scalable-vectorization=preferred -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
-define void @tail_predication(i32 %init, i32* %ptr, i32 %val) #0 {
+
+define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+; CHECK-LABEL: @simple_memset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
 entry:
   br label %while.body
 
 while.body: ; preds = %while.body, %entry
-  %index = phi i32 [ %index.dec, %while.body ], [ %init, %entry ]
-  %gep = getelementptr i32, i32* %ptr, i32 %index
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, i32* %ptr, i64 %index
   store i32 %val, i32* %gep
-  %index.dec = add nsw i32 %index, -1
-  %cmp10 = icmp sgt i32 %index, 0
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
+
+define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+; CHECK-LABEL: @simple_memcpy(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep1 = getelementptr i32, i32* %src, i64 %index
+  %val = load i32, i32* %gep1
+  %gep2 = getelementptr i32, i32* %dst, i64 %index
+  store i32 %val, i32* %gep2
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
+
+define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* noalias %ind, i64 %n) #0 {
+; CHECK-LABEL: @simple_gather_scatter(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep1 = getelementptr i32, i32* %ind, i64 %index
+  %ind_val = load i32, i32* %gep1
+  %gep2 = getelementptr i32, i32* %src, i32 %ind_val
+  %val = load i32, i32* %gep2
+  %gep3 = getelementptr i32, i32* %dst, i32 %ind_val
+  store i32 %val, i32* %gep3
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
+
+; The original loop had an unconditional uniform load. Let's make sure
+; we don't artificially create new predicated blocks for the load.
+define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
+; CHECK-LABEL: @uniform_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
+;
+
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %val = load i32, i32* %src, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+  store i32 %val, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+
+; The original loop had a conditional uniform load. In this case we actually
+; do need to perform conditional loads and so we end up using a gather instead.
+; However, we at least ensure the mask is the overlap of the loop predicate
+; and the original condition.
+define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
+; CHECK-LABEL: @cond_uniform_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT6]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label %middle.block, label %vector.body
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
+;
+
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %if.end
+  %index = phi i64 [ %index.next, %if.end ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %index
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+  %1 = load i32, i32* %src, align 4
+  br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+  %val.0 = phi i32 [ %1, %if.then ], [ 0, %for.body ]
+  %arrayidx1 = getelementptr inbounds i32, i32* %dst, i64 %index
+  store i32 %val.0, i32* %arrayidx1, align 4
+  %index.next = add nuw i64 %index, 1
+  %exitcond.not = icmp eq i64 %index.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %if.end
+  ret void
+}
+
+
+define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
+; CHECK-LABEL: @simple_fdiv(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP16:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP16]], <vscale x 4 x float>* [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP9]])
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label %middle.block, label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep1 = getelementptr float, float* %src, i64 %index
+  %gep2 = getelementptr float, float* %dst, i64 %index
+  %val1 = load float, float* %gep1
+  %val2 = load float, float* %gep2
+  %res = fdiv float %val1, %val2
+  store float %res, float* %gep2
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
   br i1 %cmp10, label %while.body, label %while.end.loopexit
 
 while.end.loopexit: ; preds = %while.body
   ret void
 }
+
+
+define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
+; CHECK-LABEL: @add_reduction_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP13:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %red = phi i32 [ %red.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, i32* %ptr, i64 %index
+  %val = load i32, i32* %gep
+  %red.next = add i32 %red, %val
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret i32 %red.next
+}
+
+
+define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
+; CHECK-LABEL: @add_reduction_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ule <vscale x 4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %red = phi float [ %red.next, %while.body ], [ 0.000000, %entry ]
+  %gep = getelementptr float, float* %ptr, i64 %index
+  %val = load float, float* %gep
+  %red.next = fadd float %red, %val
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret float %red.next
+}
+
+
+; Negative tests where we don't expect tail-folding
+
+; Integer divides can throw exceptions; since we can't scalarize conditional
+; divides for scalable vectors, we just don't bother vectorizing.
+define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+; CHECK-LABEL: @simple_idiv(
+; CHECK-NOT: vector.body
+;
+entry:
+  br label %while.body
+
+while.body: ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep1 = getelementptr i32, i32* %src, i64 %index
+  %gep2 = getelementptr i32, i32* %dst, i64 %index
+  %val1 = load i32, i32* %gep1
+  %val2 = load i32, i32* %gep2
+  %res = udiv i32 %val1, %val2
+  store i32 %res, i32* %gep2
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+  ret void
+}
+
+
 attributes #0 = { "target-features"="+sve" }
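+
+; Note: a hypothetical way to tail-fold the udiv above would be to substitute
+; a divisor of 1 in the masked-off lanes before dividing, e.g. (illustrative
+; IR only; the vectorizer does not emit this today):
+;   %safe.div = select <vscale x 4 x i1> %mask, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %splat.of.1
+;   %res = udiv <vscale x 4 x i32> %val1, %safe.div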