diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -698,7 +698,7 @@
                          const LoopAccessInfo *LAI)
       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
-  ~InterleavedAccessInfo() { reset(); }
+  ~InterleavedAccessInfo() { invalidateGroups(); }
 
   /// Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
@@ -709,16 +709,24 @@
   /// Invalidate groups, e.g., in case all blocks in loop will be predicated
   /// contrary to original assumption. Although we currently prevent group
   /// formation for predicated accesses, we may be able to relax this limitation
-  /// in the future once we handle more complicated blocks.
-  void reset() {
+  /// in the future once we handle more complicated blocks. Returns true if any
+  /// groups were invalidated.
+  bool invalidateGroups() {
+    if (InterleaveGroups.empty()) {
+      assert(
+          !RequiresScalarEpilogue &&
+          "RequiresScalarEpilogue should not be set without interleave groups");
+      return false;
+    }
+
     InterleaveGroupMap.clear();
     for (auto *Ptr : InterleaveGroups)
       delete Ptr;
     InterleaveGroups.clear();
     RequiresScalarEpilogue = false;
+    return true;
   }
 
-
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
     return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1243,6 +1243,8 @@
     if (Group->requiresScalarEpilogue())
       DelSet.insert(Group);
   }
+  assert(!DelSet.empty() && "At least one group must be invalidated, as a "
+                            "scalar epilogue was required");
  for (auto *Ptr : DelSet) {
    LLVM_DEBUG(
        dbgs()
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1309,6 +1309,13 @@
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
 
+  /// Invalidates decisions already taken by the cost model.
+  void invalidateCostModelingDecisions() {
+    WideningDecisions.clear();
+    Uniforms.clear();
+    Scalars.clear();
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -4977,8 +4984,13 @@
 
   // Invalidate interleave groups that require an epilogue if we can't mask
   // the interleave-group.
-  if (!useMaskedInterleavedAccesses(TTI))
+  if (!useMaskedInterleavedAccesses(TTI)) {
+    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+           "No decisions should have been taken at this point");
+    // Note: There is no need to invalidate any cost modeling decisions here,
+    // as none were taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+  }
 
   unsigned MaxVF = computeFeasibleMaxVF(TC);
   if (TC > 0 && TC % MaxVF == 0) {
@@ -6517,7 +6529,11 @@
         dbgs()
         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
            "which requires masked-interleaved support.\n");
-    CM.InterleaveInfo.reset();
+    if (CM.InterleaveInfo.invalidateGroups())
+      // Invalidating interleave groups also requires invalidating all decisions
+      // based on them, which includes widening decisions and uniform and scalar
+      // values.
+      CM.invalidateCostModelingDecisions();
   }
 
   if (UserVF) {
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
@@ -0,0 +1,96 @@
+; RUN: opt -loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilog -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Test for PR45572.
+
+; Check that interleave groups and decisions based on them are correctly
+; invalidated with tail-folding on platforms where masked interleaved accesses
+; are disabled.
+
+; Make sure a vector body has been created, 64-element vectors are used, and a block predicate has been computed.
+; Also make sure the loads are not widened.
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: %induction = add <64 x i32>
+; CHECK: icmp ule <64 x i32> %induction
+; CHECK-NOT: load <{{.*}} x i32>
+
+
+define void @test1(i32* %arg, i32 %N) #0 {
+entry:
+  %tmp = alloca i8
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %idx.mul = mul nuw nsw i32 %iv, 7
+  %idx.start = add nuw nsw i32 %idx.mul, 1
+  %tmp6 = getelementptr inbounds i32, i32* %arg, i32 %idx.start
+  %tmp7 = load i32, i32* %tmp6, align 4
+  %tmp8 = add nuw nsw i32 %idx.start, 1
+  %tmp9 = getelementptr inbounds i32, i32* %arg, i32 %tmp8
+  %tmp10 = load i32, i32* %tmp9, align 4
+  %tmp11 = add nuw nsw i32 %idx.start, 2
+  %tmp12 = getelementptr inbounds i32, i32* %arg, i32 %tmp11
+  %tmp13 = load i32, i32* %tmp12, align 4
+  %tmp14 = add nuw nsw i32 %idx.start, 3
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  %tmp18 = add nuw nsw i32 %idx.start, 4
+  %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp18
+  %tmp20 = load i32, i32* %tmp19, align 4
+  %tmp21 = add nuw nsw i32 %idx.start, 5
+  %tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp21
+  %tmp23 = load i32, i32* %tmp22, align 4
+  %tmp25 = add nuw nsw i32 %idx.start, 6
+  %tmp26 = getelementptr inbounds i32, i32* %arg, i32 %tmp25
+  %tmp27 = load i32, i32* %tmp26, align 4
+  store i8 0, i8* %tmp, align 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp eq i32 %iv.next, %N
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+; The loop below only requires tail folding due to interleave groups with gaps.
+; Make sure the loads are not widened.
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK-NOT: load <{{.*}} x i32>
+define void @test2(i32* %arg) #1 {
+entry:
+  %tmp = alloca i8
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %idx.start = mul nuw nsw i32 %iv, 5
+  %tmp6 = getelementptr inbounds i32, i32* %arg, i32 %idx.start
+  %tmp7 = load i32, i32* %tmp6, align 4
+  %tmp8 = add nuw nsw i32 %idx.start, 1
+  %tmp9 = getelementptr inbounds i32, i32* %arg, i32 %tmp8
+  %tmp10 = load i32, i32* %tmp9, align 4
+  %tmp11 = add nuw nsw i32 %idx.start, 2
+  %tmp12 = getelementptr inbounds i32, i32* %arg, i32 %tmp11
+  %tmp13 = load i32, i32* %tmp12, align 4
+  %tmp14 = add nuw nsw i32 %idx.start, 3
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  store i8 0, i8* %tmp, align 1
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exit.cond = icmp eq i32 %iv.next, 128
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+hvx,+hvx-length128b" }
+attributes #1 = { optsize "target-features"="+hvx,+hvx-length128b" }
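
For context, the core pattern of this patch is: invalidation of cached analysis
results now reports whether it dropped anything, so the caller clears the
dependent decision caches only when the results they were derived from actually
went away. The standalone C++ sketch below illustrates that pattern in
isolation; it is NOT LLVM code, and every name in it (Group, GroupInfo,
CostModel, foldTailByMasking) is a made-up stand-in for the corresponding
pieces of InterleavedAccessInfo and LoopVectorizationCostModel.

// Standalone sketch (hypothetical names, not LLVM API): a cache of analysis
// results ("groups") plus a second cache of decisions derived from them.
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Group {};

class GroupInfo {
  std::map<int, Group *> GroupMap;            // instruction id -> its group
  std::vector<std::unique_ptr<Group>> Groups; // owned groups
  bool RequiresScalarEpilogue = false;

public:
  void addGroup(int InstrId) {
    Groups.push_back(std::make_unique<Group>());
    GroupMap[InstrId] = Groups.back().get();
    RequiresScalarEpilogue = true;
  }

  // Returns true if any groups were invalidated, mirroring invalidateGroups()
  // above: an early exit when there is nothing to drop, otherwise clear the
  // map, release the groups, reset the flag, and report the change.
  bool invalidateGroups() {
    if (Groups.empty()) {
      assert(!RequiresScalarEpilogue && "flag must not be set without groups");
      return false;
    }
    GroupMap.clear();
    Groups.clear(); // unique_ptr releases the owned groups
    RequiresScalarEpilogue = false;
    return true;
  }
};

struct CostModel {
  GroupInfo Info;
  std::map<int, int> WideningDecisions; // decisions derived from the groups

  void invalidateCostModelingDecisions() { WideningDecisions.clear(); }
};

// Mirrors the LoopVectorize.cpp change: decisions based on the groups become
// stale once the groups are gone, so the dependent cache is cleared too, but
// only when invalidation actually dropped something.
void foldTailByMasking(CostModel &CM) {
  if (CM.Info.invalidateGroups())
    CM.invalidateCostModelingDecisions();
}

int main() {
  CostModel CM;
  CM.Info.addGroup(/*InstrId=*/0);
  CM.WideningDecisions[0] = 1; // decision derived from the group above
  foldTailByMasking(CM);       // drops both the group and the stale decision
  assert(CM.WideningDecisions.empty());
  foldTailByMasking(CM);       // no groups left: returns early, clears nothing
}

The same reasoning explains the assert added in the computeMaxVF path of the
patch: at that point no widening, uniform, or scalar decisions have been taken
yet, so only the interleave groups themselves need invalidating there.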