Index: llvm/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/include/llvm/Analysis/VectorUtils.h
+++ llvm/include/llvm/Analysis/VectorUtils.h
@@ -811,6 +811,8 @@
   /// cannot be filtered by masking the load/store.
   void invalidateGroupsRequiringScalarEpilogue();
 
+  bool haveGroups() const { return !InterleaveGroups.empty(); }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5063,9 +5063,23 @@
   // Invalidate interleave groups that require an epilogue if we can't mask
   // the interleave-group.
-  if (!useMaskedInterleavedAccesses(TTI)) {
+  if (InterleaveInfo.haveGroups() && !useMaskedInterleavedAccesses(TTI)) {
     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
            "No decisions should have been taken at this point");
+
+    // If the target doesn't have masked interleaved accesses, then it's very
+    // likely the costs will be far too high to consider vectorising, e.g. see
+    // where useEmulatedMaskMemRefHack is used. If we're permitted to fall back
+    // on an unpredicated vector loop + scalar epilogue then let's do it now.
+    if (UserVF.isZero() &&
+        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+      LLVM_DEBUG(dbgs() << "LV: Not folding tail by masking due to "
+                           "interleaving: vectorize with a scalar epilogue "
+                           "instead.\n");
+      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+      return computeFeasibleMaxVF(TC, UserVF, false);
+    }
+
     // Note: There is no need to invalidate any cost modeling decisions here, as
     // non where taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-interleave.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-interleave.ll
@@ -0,0 +1,67 @@
+; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN:   -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=DEBUG %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; DEBUG: Not folding tail by masking due to interleaving: vectorize with a scalar epilogue instead.
+
+define void @foo(ptr noalias nocapture noundef writeonly %dst, ptr noalias nocapture noundef readonly %src, i64 noundef %n) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 -2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[STRIDED_VEC]], <4 x float> [[STRIDED_VEC1]], <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> , <12 x i32>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x float> [[TMP9]], <12 x float> poison, <12 x i32>
+; CHECK-NEXT:    store <12 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = shl nuw nsw i64 %i.021, 1
+  %arrayidx = getelementptr inbounds float, ptr %src, i64 %mul
+  %0 = load float, ptr %arrayidx, align 4
+  %mul1 = mul nuw nsw i64 %i.021, 3
+  %arrayidx2 = getelementptr inbounds float, ptr %dst, i64 %mul1
+  store float %0, ptr %arrayidx2, align 4
+  %add = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float, ptr %src, i64 %add
+  %1 = load float, ptr %arrayidx4, align 4
+  %add6 = add nuw nsw i64 %mul1, 1
+  %arrayidx7 = getelementptr inbounds float, ptr %dst, i64 %add6
+  store float %1, ptr %arrayidx7, align 4
+  %add9 = add nuw nsw i64 %mul1, 2
+  %arrayidx10 = getelementptr inbounds float, ptr %dst, i64 %add9
+  store float 3.000000e+00, ptr %arrayidx10, align 4
+  %inc = add nuw nsw i64 %i.021, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
Index: llvm/test/Transforms/LoopVectorize/tail-folding-interleave.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/tail-folding-interleave.ll
@@ -0,0 +1,33 @@
+; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN:   -enable-interleaved-mem-accesses -debug -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: Not folding tail by masking due to interleaving: vectorize with a scalar epilogue instead.
+
+define void @foo(ptr noalias nocapture noundef writeonly %dst, ptr noalias nocapture noundef readonly %src, i64 noundef %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = shl nuw nsw i64 %i.021, 1
+  %arrayidx = getelementptr inbounds float, ptr %src, i64 %mul
+  %0 = load float, ptr %arrayidx, align 4
+  %mul1 = mul nuw nsw i64 %i.021, 3
+  %arrayidx2 = getelementptr inbounds float, ptr %dst, i64 %mul1
+  store float %0, ptr %arrayidx2, align 4
+  %add = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float, ptr %src, i64 %add
+  %1 = load float, ptr %arrayidx4, align 4
+  %add6 = add nuw nsw i64 %mul1, 1
+  %arrayidx7 = getelementptr inbounds float, ptr %dst, i64 %add6
+  store float %1, ptr %arrayidx7, align 4
+  %add9 = add nuw nsw i64 %mul1, 2
+  %arrayidx10 = getelementptr inbounds float, ptr %dst, i64 %add9
+  store float 3.000000e+00, ptr %arrayidx10, align 4
+  %inc = add nuw nsw i64 %i.021, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}