diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1285,6 +1285,12 @@
 
   /// Returns true if \p I is known to be uniform after vectorization.
   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+    // Pseudo probe needs to be duplicated for each unrolled iteration and
+    // vector lane so that profiled loop trip count can be accurately
+    // accumulated instead of being under counted.
+    if (isa<PseudoProbeInst>(I))
+      return false;
+
     if (VF.isScalar())
       return true;
 
@@ -8939,7 +8945,7 @@
 
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.
-    for (Instruction &I : BB->instructionsWithoutDebug()) {
+    for (Instruction &I : BB->instructionsWithoutDebug(false)) {
       Instruction *Instr = &I;
 
       // First filter out irrelevant instructions, to ensure no recipes are
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(ptr nocapture %a, ptr nocapture readonly %b) #0 {
+entry:
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 1, i32 0, i64 -1)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !tbaa !1
+  %conv = fptosi float %0 to i32
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %conv, ptr %arrayidx2, align 4, !tbaa !5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 2, i32 0, i64 -1)
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 3, i32 0, i64 -1)
+  ret i32 0
+}
+
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: load <4 x float>, ptr %{{.*}}
+; CHECK: store <4 x i32> %{{.*}}, ptr %{{.*}}
+; CHECK-COUNT-4: call void @llvm.pseudoprobe(i64 3666282617048535130, i64 2, i32 0, i64 -1)
+; CHECK: %index.next = add nuw i64 %index, 4
+
+
+
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+
+!llvm.pseudo_probe_desc = !{!0}
+
+!0 = !{i64 3666282617048535130, i64 52824598631, !"test1"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}