Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -573,6 +573,10 @@
     /// Don't allow loop unrolling to simulate more than this number of
     /// iterations when checking full unroll profitability
     unsigned MaxIterationsCountToAnalyze;
+    /// Allow runtime unrolling of the epilogue loop left by the loop
+    /// vectorizer. When false (the default), the vectorizer marks that loop
+    /// with runtime-unroll-disable metadata.
+    bool unrollLoopEpilogues = false;
   };
 
   /// Get target-customized preferences for the generic loop unrolling
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -113,6 +113,9 @@
   // manipulations in average.
   UP.BEInsns += 3;
 
+  // Let epilogue loops left by the loop vectorizer be runtime unrolled.
+  UP.unrollLoopEpilogues = true;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7778,7 +7778,14 @@
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
-  AddRuntimeUnrollDisableMetaData(L);
+  // Runtime unrolling of this loop is disabled unless the target asks to keep
+  // it via UnrollingPreferences::unrollLoopEpilogues. Value-initialize UP:
+  // target hooks read-modify-write its fields (e.g. `UP.BEInsns += 3`), which
+  // would otherwise read indeterminate values.
+  TargetTransformInfo::UnrollingPreferences UP{};
+  TTI->getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+  if (!UP.unrollLoopEpilogues)
+    AddRuntimeUnrollDisableMetaData(L);
 
   // 3. Fix the vectorized code: take care of header phi's, live-outs,
   // predication, updating analyses.