diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -567,8 +567,10 @@
       SavedMutations.swap(Mutations);
 
     for (auto Region : Regions) {
-      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+        ++RegionIdx;
         continue;
+      }
 
       RegionBegin = Region.first;
       RegionEnd = Region.second;
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -3,6 +3,9 @@
 ; Interleave loads and stores to fit into 9 VGPR limit.
 ; This requires to avoid load/store clustering.
 
+; Reschedule the second scheduling region without clustering while
+; the first region is skipped.
+
 ; GCN: global_load_dwordx4
 ; GCN: global_store_dwordx4
 ; GCN: global_load_dwordx4
@@ -12,10 +15,13 @@
 ; GCN: NumVgprs: {{[0-9]$}}
 ; GCN: ScratchSize: 0{{$}}
 
-define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1, i1 %cnd) #1 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
   %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
   %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
   %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
@@ -27,6 +33,9 @@
   store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
   store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  br label %bb2
+
+bb2:
   ret void
 }
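
For readers outside the scheduler code, here is a minimal standalone sketch of the invariant the fix restores. This is a hypothetical reduction, not the actual LLVM scheduler: only the Regions/RescheduleRegions/RegionIdx names come from the patch. Regions and RescheduleRegions are parallel sequences, so RegionIdx must advance even on the skip path, or every later region reads its neighbor's entry.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // Parallel arrays standing in for the scheduler's per-region state.
  std::vector<const char *> Regions = {"region0", "region1", "region2"};
  std::vector<bool> RescheduleRegions = {false, true, false};

  size_t RegionIdx = 0;
  for (const char *Region : Regions) {
    if (!RescheduleRegions[RegionIdx]) {
      // The fix: keep the index in lockstep with the loop before skipping.
      // Without this increment, region2 would be looked up at index 1.
      ++RegionIdx;
      continue;
    }
    std::printf("rescheduling %s (idx %zu)\n", Region, RegionIdx);
    assert(Region == Regions[RegionIdx] && "index stayed in sync");
    ++RegionIdx;
  }
}

With the increment on the skip path this prints "rescheduling region1 (idx 1)"; dropping it trips the assertion, which mirrors why the test above adds a branch: the branch splits the kernel into two scheduling regions so the first is skipped and the second must still be found at the correct index.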