diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -573,6 +573,8 @@
     /// Don't allow loop unrolling to simulate more than this number of
     /// iterations when checking full unroll profitability
     unsigned MaxIterationsCountToAnalyze;
+    /// If true, keep runtime unrolling enabled for loops that were vectorized.
+    bool UnrollVectorizedLoop = false;
   };
 
   /// Get target-customized preferences for the generic loop unrolling
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -113,6 +113,9 @@
   // manipulations in average.
   UP.BEInsns += 3;
 
+  // Let the unroller run even on loops that have already been vectorized.
+  UP.UnrollVectorizedLoop = true;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7778,7 +7778,12 @@
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
-  AddRuntimeUnrollDisableMetaData(L);
+  // Value-initialize UP: the target hook may read-modify-write fields (e.g.
+  // AMDGPU's `UP.BEInsns += 3`), so they must not start out indeterminate.
+  TargetTransformInfo::UnrollingPreferences UP{};
+  TTI->getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+  if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+    AddRuntimeUnrollDisableMetaData(L);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   //    predication, updating analyses.
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-unroll-metadata.ll b/llvm/test/CodeGen/AMDGPU/vectorize-unroll-metadata.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-unroll-metadata.ll
@@ -0,0 +1,41 @@
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx90a -passes=loop-vectorize %s -S -o - | FileCheck %s
+; Expect the vector.body latch (!0) to carry only llvm.loop.isvectorized, while the loop.inc latch (!2) also carries llvm.loop.unroll.runtime.disable.
+; CHECK-LABEL: @test
+; CHECK-LABEL: vector.body:
+; CHECK: br i1 %{{[0-9]+}}, label %middle.block, label %vector.body, !llvm.loop !0
+; CHECK-LABEL: middle.block:
+; CHECK-LABEL: scalar.ph:
+; CHECK-LABEL: loop.header:
+; CHECK-LABEL: loop.body:
+; CHECK-LABEL: loop.inc:
+; CHECK: br i1 %cond, label %exit, label %loop.header, !llvm.loop !2
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK: !1 = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: !2 = distinct !{!2, !3, !1}
+; CHECK: !3 = !{!"llvm.loop.unroll.runtime.disable"}
+
+; Counted loop that copies i32 elements from %lds (addrspace 3) to %out (addrspace 1) until %counter >= %n (signed).
+define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr_lds = getelementptr i32, ptr addrspace(3) %lds, i32 %counter
+  %val = load i32, ptr addrspace(3) %ptr_lds
+  %ptr_out = getelementptr i32, ptr addrspace(1) %out, i32 %counter
+  store i32 %val, ptr addrspace(1) %ptr_out
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cond = icmp sge i32 %counter, %n
+  br i1 %cond, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+