Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -69,6 +69,11 @@
   cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
   cl::init(150), cl::Hidden);
 
+static cl::opt<bool> UnrollRuntimeLocal(
+  "amdgpu-unroll-runtime-local",
+  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
+  cl::init(true), cl::Hidden);
+
 static cl::opt<bool> UseLegacyDA(
   "amdgpu-use-legacy-divergence-analysis",
   cl::desc("Enable legacy divergence analysis for AMDGPU"),
@@ -177,6 +182,9 @@
             (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
              !isa<Argument>(GEP->getPointerOperand())))
           continue;
+        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
+                          << *L << " due to LDS use.\n");
+        UP.Runtime = UnrollRuntimeLocal;
       }
 
       // Check if GEP depends on a value defined by this loop itself.
Index: llvm/test/CodeGen/AMDGPU/unroll.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/unroll.ll
+++ llvm/test/CodeGen/AMDGPU/unroll.ll
@@ -99,3 +99,37 @@
 for.end:                                          ; preds = %for.cond
   ret void
 }
+
+; Check that runtime unroll is enabled for local memory references
+
+; CHECK-LABEL: @local_memory_runtime
+; CHECK: loop.header:
+; CHECK: load i32, i32 addrspace(3)*
+; CHECK: load i32, i32 addrspace(3)*
+; CHECK: br i1
+; CHECK: loop.header.epil
+; CHECK: load i32, i32 addrspace(3)*
+; CHECK: ret
+define amdgpu_kernel void @local_memory_runtime(i32 addrspace(1)* %out, i32 addrspace(3)* %lds, i32 %n) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+  %val = load i32, i32 addrspace(3)* %ptr_lds
+  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+  store i32 %val, i32 addrspace(1)* %ptr_out
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cond = icmp sge i32 %counter, %n
+  br i1 %cond, label %exit, label %loop.header
+
+exit:
+  ret void
+}