diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1676,8 +1676,13 @@
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width.
-  VectorizationCostTy expectedCost(ElementCount VF);
+  /// the factor width. If \p Invalid is not nullptr, this function
+  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+  /// each instruction that has an Invalid cost for the given VF.
+  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
+  VectorizationCostTy
+  expectedCost(ElementCount VF,
+               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
@@ -6075,12 +6080,13 @@
     ChosenFactor.Cost = InstructionCost::getMax();
   }
 
+  SmallVector<InstructionVFPair> InvalidCosts;
   for (const auto &i : VFCandidates) {
     // The cost for scalar VF=1 is already calculated, so ignore it.
     if (i.isScalar())
       continue;
 
-    VectorizationCostTy C = expectedCost(i);
+    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
     VectorizationFactor Candidate(i, C.first);
     LLVM_DEBUG(
         dbgs() << "LV: Vector loop of width " << i << " costs: "
@@ -6103,6 +6109,64 @@
       ChosenFactor = Candidate;
   }
 
+  // Emit a report of VFs with invalid costs in the loop.
+  if (!InvalidCosts.empty()) {
+    // Number the instructions in the order in which they first appear in
+    // InvalidCosts. Instruction::comesBefore cannot provide the ordering
+    // here: it requires both instructions to be in the same basic block,
+    // and OR-ing it with the VF comparison is not a strict weak ordering.
+    DenseMap<Instruction *, unsigned> Numbering;
+    for (const InstructionVFPair &Pair : InvalidCosts)
+      Numbering.insert({Pair.first, Numbering.size()});
+
+    // Sort the list, first on instruction (number), then on VF.
+    llvm::sort(InvalidCosts, [&Numbering](const InstructionVFPair &A,
+                                          const InstructionVFPair &B) {
+      unsigned NumA = Numbering.lookup(A.first);
+      unsigned NumB = Numbering.lookup(B.first);
+      if (NumA != NumB)
+        return NumA < NumB;
+      ElementCountComparator ECC;
+      return ECC(A.second, B.second);
+    });
+
+    // For a list of ordered instruction-vf pairs:
+    //   [(load, vf1), (load, vf2), (store, vf1)]
+    // Group the instructions together to emit separate remarks for:
+    //   load  (vf1, vf2)
+    //   store (vf1)
+    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
+    while (!Tail.empty()) {
+      // Take the leading run of pairs that share the same instruction, e.g.
+      //   [(load, vf1), (load, vf2)]
+      // to emit:
+      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
+      Instruction *I = Tail.front().first;
+      auto Subset = Tail.take_while(
+          [I](const InstructionVFPair &Pair) { return Pair.first == I; });
+
+      std::string OutString;
+      raw_string_ostream OS(OutString);
+      OS << "Instruction with invalid costs prevented vectorization at VF=(";
+      const char *Separator = "";
+      for (const InstructionVFPair &Pair : Subset) {
+        OS << Separator << Pair.second;
+        Separator = ", ";
+      }
+      OS << "):";
+      // getCalledFunction() is null for indirect calls; fall back to the
+      // opcode name then instead of dereferencing a null pointer.
+      auto *CI = dyn_cast<CallInst>(I);
+      if (CI && CI->getCalledFunction())
+        OS << " call to " << CI->getCalledFunction()->getName();
+      else
+        OS << " " << I->getOpcodeName();
+      OS.flush();
+      reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+      Tail = Tail.drop_front(Subset.size());
+    }
+  }
+
   if (!EnableCondStoresVectorization && NumPredStores) {
     reportVectorizationFailure("There are conditional stores.",
         "store that is conditionally executed prevents vectorization",
@@ -6884,7 +6948,8 @@
 }
 
 LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(ElementCount VF) {
+LoopVectorizationCostModel::expectedCost(
+    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
   VectorizationCostTy Cost;
 
   // For each block.
@@ -6904,6 +6969,10 @@
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
         C.first = InstructionCost(ForceTargetInstructionCost);
 
+      // Keep a list of instructions with invalid costs.
+      if (Invalid && !C.first.isValid())
+        Invalid->emplace_back(&I, VF);
+
       BlockCost.first += C.first;
       BlockCost.second |= C.second;
       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on \
+; RUN:   -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
 ; CHECK-LABEL: @vec_load
@@ -95,6 +97,10 @@
   ret void
 }
 
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_no_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -105,10 +111,10 @@
 for.body:                                         ; preds = %entry, %for.body
   %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %0 = load float, float* %arrayidx, align 4, !dbg !11
+  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
   %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
-  store float %1, float* %arrayidx1, align 4
+  store float %1, float* %arrayidx1, align 4, !dbg !13
   %inc = add nuw nsw i64 %i.07, 1
   %exitcond.not = icmp eq i64 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
@@ -117,6 +123,10 @@
   ret void
 }
 
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_fixed_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -127,10 +137,10 @@
 for.body:                                         ; preds = %entry, %for.body
   %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %1 = tail call fast float @llvm.sin.f32(float %0) #3
+  %0 = load float, float* %arrayidx, align 4, !dbg !11
+  %1 = tail call fast float @llvm.sin.f32(float %0) #3, !dbg !12
   %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
-  store float %1, float* %arrayidx1, align 4
+  store float %1, float* %arrayidx1, align 4, !dbg !13
   %inc = add nuw nsw i64 %i.07, 1
   %exitcond.not = icmp eq i64 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
@@ -183,3 +193,18 @@
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 2}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+!llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!7}
+!llvm.ident = !{!8}
+
+!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !5, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !6, splitDebugInlining: false, nameTableKind: None)
+!5 = !DIFile(filename: "t.c", directory: "somedir")
+!6 = !{}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{!"clang"}
+!9 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 2, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6)
+!10 = !DISubroutineType(types: !6)
+!11 = !DILocation(line: 3, column: 10, scope: !9)
+!12 = !DILocation(line: 3, column: 20, scope: !9)
+!13 = !DILocation(line: 3, column: 30, scope: !9)