diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -699,7 +699,7 @@ const LoopAccessInfo *LAI) : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} - ~InterleavedAccessInfo() { reset(); } + ~InterleavedAccessInfo() { invalidateGroups(); } /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. @@ -710,16 +710,18 @@ /// Invalidate groups, e.g., in case all blocks in loop will be predicated /// contrary to original assumption. Although we currently prevent group /// formation for predicated accesses, we may be able to relax this limitation - /// in the future once we handle more complicated blocks. - void reset() { + /// in the future once we handle more complicated blocks. Returns true if any + /// groups were invalidated. + bool invalidateGroups() { InterleaveGroupMap.clear(); for (auto *Ptr : InterleaveGroups) delete Ptr; + bool Changed = !InterleaveGroups.empty(); InterleaveGroups.clear(); RequiresScalarEpilogue = false; + return Changed; } - /// Check if \p Instr belongs to any interleave group. bool isInterleaved(Instruction *Instr) const { return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end(); @@ -746,8 +748,9 @@ /// Invalidate groups that require a scalar epilogue (due to gaps). This can /// happen when optimizing for size forbids a scalar epilogue, and the gap - /// cannot be filtered by masking the load/store. - void invalidateGroupsRequiringScalarEpilogue(); + /// cannot be filtered by masking the load/store. Returns true, if any groups + /// have been invalidated. + bool invalidateGroupsRequiringScalarEpilogue(); private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1230,11 +1230,11 @@ } } -void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { +bool InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { // If no group had triggered the requirement to create an epilogue loop, // there is nothing to do. if (!requiresScalarEpilogue()) - return; + return false; // Avoid releasing a Group twice. SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet; @@ -1253,6 +1253,7 @@ } RequiresScalarEpilogue = false; + return !DelSet.empty(); } template <typename InstT> diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1309,6 +1309,13 @@ /// i.e. either vector version isn't available, or is too expensive. unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + /// Invalidates decisions already taken by the cost model. + void invalidateCostModelingDecisions() { + WideningDecisions.clear(); + Uniforms.clear(); + Scalars.clear(); + } + private: unsigned NumPredStores = 0; @@ -4978,7 +4985,11 @@ // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. if (!useMaskedInterleavedAccesses(TTI)) - InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + if (InterleaveInfo.invalidateGroupsRequiringScalarEpilogue()) + // Invalidating interleave groups also requires invalidating all decisions + // based on them, which includes widening decisions and uniform and scalar + // values. 
+ invalidateCostModelingDecisions(); unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { @@ -6517,7 +6528,11 @@ dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " "which requires masked-interleaved support.\n"); - CM.InterleaveInfo.reset(); + if (CM.InterleaveInfo.invalidateGroups()) + // Invalidating interleave groups also requires invalidating all decisions + // based on them, which includes widening decisions and uniform and scalar + // values. + CM.invalidateCostModelingDecisions(); } if (UserVF) { diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll @@ -0,0 +1,55 @@ +; RUN: opt -loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilog -S %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; Test for PR45572. + +; Check that interleave groups and decisions based on them are correctly +; invalidated with tail-folding on platforms where masked interleaved accesses +; are disabled. + +; Make sure a vector body has been created, 64 element vectors are used and a block predicate has been computed. 
+; CHECK-LABEL: vector.body: +; CHECK: %induction = add <64 x i32> +; CHECK: icmp ule <64 x i32> %induction + +define void @foo(i32* %arg, i32 %N) #0 { +entry: + %tmp = alloca i8 + br label %loop + +loop: ; preds = %bb2, %bb + %iv = phi i32 [ %iv.next, %loop], [ 0, %entry ] + %idx.mul = mul nuw nsw i32 %iv, 7 + %idx.start = add nuw nsw i32 %idx.mul, 1 + %tmp6 = getelementptr inbounds i32, i32* %arg, i32 %idx.start + %tmp7 = load i32, i32* %tmp6, align 4 + %tmp8 = add nuw nsw i32 %idx.start, 1 + %tmp9 = getelementptr inbounds i32, i32* %arg, i32 %tmp8 + %tmp10 = load i32, i32* %tmp9, align 4 + %tmp11 = add nuw nsw i32 %idx.start, 2 + %tmp12 = getelementptr inbounds i32, i32* %arg, i32 %tmp11 + %tmp13 = load i32, i32* %tmp12, align 4 + %tmp14 = add nuw nsw i32 %idx.start, 3 + %tmp15 = getelementptr inbounds i32, i32* %arg, i32 %tmp14 + %tmp16 = load i32, i32* %tmp15, align 4 + %tmp18 = add nuw nsw i32 %idx.start, 4 + %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp18 + %tmp20 = load i32, i32* %tmp19, align 4 + %tmp21 = add nuw nsw i32 %idx.start, 5 + %tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp21 + %tmp23 = load i32, i32* %tmp22, align 4 + %tmp25 = add nuw nsw i32 %idx.start, 6 + %tmp26 = getelementptr inbounds i32, i32* %arg, i32 %tmp25 + %tmp27 = load i32, i32* %tmp26, align 4 + store i8 0, i8* %tmp, align 1 + %iv.next= add nuw nsw i32 %iv, 1 + %exit.cond = icmp eq i32 %iv.next, %N + br i1 %exit.cond, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +attributes #0 = { "target-features"="+hvx,+hvx-length128b" }