Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1852,7 +1852,7 @@
 
   /// \return An upper bound for the vectorization factor, or None if
   /// vectorization should be avoided up front.
-  Optional<unsigned> computeMaxVF(bool OptForSize);
+  Optional<unsigned> computeMaxVF(bool OptForSize, bool OptForDivergent);
 
   /// Information about vectorization costs
   struct VectorizationFactor {
@@ -2218,6 +2218,7 @@
 
   /// Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+                                                       bool OptForDivergent,
                                                        unsigned UserVF);
 
   /// Generate the IR code for the vectorized loop.
@@ -6270,7 +6271,8 @@
   }
 }
 
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize,
+                                                            bool OptForDivergent) {
   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
     ORE->emit(createMissedAnalysis("ConditionalStore")
               << "store that is conditionally executed prevents vectorization");
@@ -6278,29 +6280,29 @@
     return None;
   }
 
-  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
-    // TODO: It may by useful to do since it's still likely to be dynamically
-    // uniform if the target can skip.
-    DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+  if (Legal->getRuntimePointerChecking()->Need) {
+    if (OptForSize) {
+      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+                << "runtime pointer checks needed. Enable vectorization of this "
+                   "loop with '#pragma clang loop vectorize(enable)' when "
+                   "compiling with -Os/-Oz");
+      DEBUG(dbgs()
+            << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+
+      return None;
+    }
 
-    ORE->emit(
-        createMissedAnalysis("CantVersionLoopWithDivergentTarget")
-        << "runtime pointer checks needed. Not enabled for divergent target");
+    if (OptForDivergent) {
+      // TODO: It may be useful to do this since the check is still likely to
+      // be dynamically uniform if the target can skip it.
+      DEBUG(dbgs()
+            << "LV: Not inserting runtime ptr check for divergent target\n");
 
-    return None;
-  }
-
-  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
-    return computeFeasibleMaxVF(OptForSize);
+      ORE->emit(
+          createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+          << "runtime pointer checks needed. Not enabled for divergent target");
 
-  if (Legal->getRuntimePointerChecking()->Need) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime pointer checks needed. Enable vectorization of this "
-              "loop with '#pragma clang loop vectorize(enable)' when "
-              "compiling with -Os/-Oz");
-    DEBUG(dbgs()
-          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
-    return None;
+      return None;
+    }
   }
 
   // If we optimize the program for size, avoid creating the tail loop.
@@ -6308,17 +6310,17 @@
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   // If we don't know the precise trip count, don't try to vectorize.
-  if (TC < 2) {
+  if (TC < 2 && (OptForSize || OptForDivergent)) {
     ORE->emit(
-        createMissedAnalysis("UnknownLoopCountComplexCFG")
-        << "unable to calculate the loop count due to complex control flow");
-    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+        createMissedAnalysis("UnknownLoopCountComplexCFG")
+        << "unable to calculate the loop count due to complex control flow");
+    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with "
+                 << (OptForSize ? "-Os/-Oz.\n" : "divergent target.\n"));
     return None;
   }
 
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
-
-  if (TC % MaxVF != 0) {
+  if (OptForSize && TC % MaxVF != 0) {
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
@@ -6327,9 +6329,9 @@
     ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
               << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
+              "same time. Enable vectorization of this loop "
+              "with '#pragma clang loop vectorize(enable)' "
+              "when compiling with -Os/-Oz");
     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
     return None;
   }
 
@@ -7609,12 +7611,13 @@
 }
 
 LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+LoopVectorizationPlanner::plan(bool OptForSize, bool OptForDivergent,
+                               unsigned UserVF) {
   // Width 1 means no vectorize, cost 0 means uncomputed cost.
   const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
                                                                            0U};
 
-  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize, OptForDivergent);
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
@@ -7829,6 +7832,10 @@
   bool OptForSize =
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
+  bool OptForDivergent =
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      TTI->hasBranchDivergence();
+
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
   unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
@@ -7898,7 +7905,7 @@
 
   // Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor VF =
-      LVP.plan(OptForSize, UserVF);
+      LVP.plan(OptForSize, OptForDivergent, UserVF);
 
   // Select the interleave count.
   unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
Index: test/Transforms/LoopVectorize/AMDGPU/divergent-loop-bounds.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AMDGPU/divergent-loop-bounds.ll
@@ -0,0 +1,183 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -pass-remarks-analysis='loop-vectorize' < %s 2>&1 | FileCheck -check-prefixes=GFX9-REMARK %s
+
+; It may make sense to vectorize this if the condition is uniform, but
+; assume that it isn't for now.
+
+; GCN-LABEL: @small_loop_i16_unknown_uniform_size(
+; GCN: load i16
+; GCN: add nsw i16
+; GCN: store i16
+; GCN: br i1 %cond
+
+; GFX9-REMARK: remark: <unknown>:0:0: loop not vectorized: unable to calculate the loop count due to complex control flow
+define amdgpu_kernel void @small_loop_i16_unknown_uniform_size(i16 addrspace(1)* nocapture %inArray, i16 %size) #0 {
+entry:
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; GCN-LABEL: @small_loop_i16_unknown_divergent_size(
+; GCN: load i16
+; GCN: add nsw i16
+; GCN: store i16
+; GCN: br i1 %cond
+
+; GFX9-REMARK: remark: <unknown>:0:0: loop not vectorized: unable to calculate the loop count due to complex control flow
+define amdgpu_kernel void @small_loop_i16_unknown_divergent_size(i16 addrspace(1)* nocapture %inArray, i16 addrspace(1)* %size.ptr) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %size.gep = getelementptr inbounds i16, i16 addrspace(1)* %size.ptr, i32 %tid
+  %size = load i16, i16 addrspace(1)* %size.gep
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; This loop is vectorized: the trip count is known and divisible by the
+; vectorization factor, so no scalar tail iterations are needed.
+
+; GCN-LABEL: @small_loop_i16_256(
+; GFX9: load <2 x i16>
+; GFX9: add nsw <2 x i16>
+; GFX9: store <2 x i16>
+; GFX9: add i32 %index, 2
+; GFX9: br i1
+
+; VI-NOT: <2 x i16>
+define amdgpu_kernel void @small_loop_i16_256(i16 addrspace(1)* nocapture %inArray) #0 {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, 256
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; Trip count not divisible by the vectorization factor of 2, so a scalar
+; tail loop is required.
+; GCN-LABEL: @small_loop_i16_255(
+; GFX9: load <2 x i16>
+; GFX9: add nsw <2 x i16>
+; GFX9: store <2 x i16>
+; GFX9: add i32 %index, 2
+; GFX9: br i1
+
+; VI-NOT: <2 x i16>
+define amdgpu_kernel void @small_loop_i16_255(i16 addrspace(1)* nocapture %inArray) #0 {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, 255
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; Metadata indicates it should be vectorized even though it may be
+; divergent.
+; GCN-LABEL: @small_loop_i16_unknown_uniform_size_forced(
+; GCN: load <2 x i16>
+; GCN: add nsw <2 x i16>
+; GCN: store <2 x i16>
+; GCN: add i32 %index, 2
+; GCN: br i1
+define amdgpu_kernel void @small_loop_i16_unknown_uniform_size_forced(i16 addrspace(1)* nocapture %inArray, i32 %size) #0 {
+entry:
+  %cmp = icmp sgt i32 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i32 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %iv1 = add i32 %iv, 1
+  %cond = icmp eq i32 %iv1, %size
+  br i1 %cond, label %exit, label %loop, !llvm.loop !2
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; GCN-LABEL: @small_loop_i16_unknown_divergent_size_forced(
+; GCN: load <2 x i16>
+; GCN: add nsw <2 x i16>
+; GCN: store <2 x i16>
+; GCN: add i32 %index, 2
+; GCN: br i1
+define amdgpu_kernel void @small_loop_i16_unknown_divergent_size_forced(i16 addrspace(1)* nocapture %inArray, i16 addrspace(1)* %size.ptr) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %size.gep = getelementptr inbounds i16, i16 addrspace(1)* %size.ptr, i32 %tid
+  %size = load i16, i16 addrspace(1)* %size.gep
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop, !llvm.loop !2
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+
+!0 = distinct !{!0}
+!1 = distinct !{!1}
+!2 = distinct !{!2, !3, !4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
\ No newline at end of file
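
For reference, the decision flow that computeMaxVF() implements after this
patch can be summarized in a small standalone sketch. Everything below uses
stand-in types (a hypothetical LoopProperties struct, std::optional) rather
than the actual LLVM classes, so it illustrates the logic only; it is not
the implementation:

// Simplified model of the post-patch computeMaxVF() decision flow.
// LoopProperties is a hypothetical stand-in for the legality/cost-model state.
#include <cstdio>
#include <optional>

struct LoopProperties {
  bool NeedsRuntimePtrChecks; // Legal->getRuntimePointerChecking()->Need
  unsigned TripCount;         // 0 when the trip count is unknown
  unsigned FeasibleMaxVF;     // stands in for computeFeasibleMaxVF(); >= 1
};

std::optional<unsigned> computeMaxVF(const LoopProperties &L, bool OptForSize,
                                     bool OptForDivergent) {
  // Runtime pointer checks are refused under -Os/-Oz, and (for now) also on
  // divergent targets, where the compared pointers may differ per lane.
  if (L.NeedsRuntimePtrChecks && (OptForSize || OptForDivergent))
    return std::nullopt;

  // An unknown or tiny trip count implies a scalar tail loop, which both
  // -Os/-Oz and divergent targets want to avoid.
  if (L.TripCount < 2 && (OptForSize || OptForDivergent))
    return std::nullopt;

  // Only -Os/-Oz additionally requires the trip count to divide evenly by
  // the VF; a divergent target tolerates a scalar remainder loop.
  if (OptForSize && L.TripCount % L.FeasibleMaxVF != 0)
    return std::nullopt;

  return L.FeasibleMaxVF;
}

int main() {
  // Mirrors @small_loop_i16_255: divergent target, known trip count 255,
  // VF 2. Vectorization proceeds and leaves a scalar tail iteration.
  LoopProperties L{false, 255, 2};
  if (auto VF = computeMaxVF(L, /*OptForSize=*/false, /*OptForDivergent=*/true))
    std::printf("MaxVF = %u\n", *VF);
  return 0;
}

As in the patch itself, a loop with a known trip count on a divergent target
is still allowed to produce a scalar remainder; only the unknown-trip-count
and runtime pointer-check cases bail out early.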