Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -922,6 +922,7 @@ def S_ENDPGM : SOPP <0x00000001, (ins EndpgmImm:$simm16), "s_endpgm $simm16"> { let isBarrier = 1; let isReturn = 1; + let isConvergent = 1; } def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { @@ -929,14 +930,15 @@ let simm16 = 0; let isBarrier = 1; let isReturn = 1; + let isConvergent = 1; } -let SubtargetPredicate = isGFX9Plus in { +let SubtargetPredicate = isGFX9Plus, isConvergent = 1 in { let isBarrier = 1, isReturn = 1, simm16 = 0 in { def S_ENDPGM_ORDERED_PS_DONE : SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">; } // End isBarrier = 1, isReturn = 1, simm16 = 0 -} // End SubtargetPredicate = isGFX9Plus +} // End SubtargetPredicate = isGFX9Plus, isConvergent = 1 let SubtargetPredicate = isGFX10Plus in { let isBarrier = 1, isReturn = 1, simm16 = 0 in { Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -2168,15 +2168,15 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] ; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] -; GCN: s_cbranch_scc0 - -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] -; GCN: s_endpgm +; GCN: s_cbranch_scc1 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] ; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] +; GCN: s_endpgm + define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/tail-duplication-convergent.ll =================================================================== --- test/CodeGen/AMDGPU/tail-duplication-convergent.ll +++ test/CodeGen/AMDGPU/tail-duplication-convergent.ll @@ -100,6 +100,30 @@ ret void } +; s_endpgm shouldn't be duplicated. + +; GCN-LABEL: {{^}}taildup_ret: +; GCN: s_endpgm +; GCN-NOT: s_endpgm +define amdgpu_kernel void @taildup_ret(i32 addrspace(1)* %a, i1 %cond) #0 { +entry: + br i1 %cond, label %bb1, label %bb2 + +bb1: + store i32 0, i32 addrspace(1)* %a + br label %call + +bb2: + store i32 1, i32 addrspace(1)* %a + br label %call + +call: + br label %ret + +ret: + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind convergent } Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -169,8 +169,6 @@ ; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN: buffer_store_dword [[TWO]] -; GCN: s_endpgm ; GCN: {{^}}[[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 @@ -252,11 +250,9 @@ ; GCN: s_cmp_lt_i32 s[[COND0]], 1 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}} -; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]] -; GCN: {{^}}[[EXIT]]: -; GCN: s_endpgm -; GCN: {{^}}[[BODY]]: +; GCN: s_cbranch_vccnz [[EXIT:[A-Za-z0-9_]+]] ; GCN: buffer_store +; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: @@ -304,11 +300,11 @@ ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] -; GCN: s_endpgm -; GCN: {{^}}[[IF_UNIFORM_LABEL]]: +; GCN: s_cbranch_scc1 [[ENDIF_UNIFORM_LABEL:[A-Z0-9_a-z]+]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] +; GCN: {{^}}[[ENDIF_UNIFORM_LABEL]]: +; GCN: s_endpgm define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -364,11 +360,11 @@ ; GCN: buffer_store_dword [[ONE]] ; GCN: s_or_b64 exec, exec, [[MASK]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]] -; GCN: s_endpgm -; GCN: [[IF_UNIFORM]]: +; GCN: s_cbranch_scc1 [[ENDIF_UNIFORM:[A-Z0-9_]+]] ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] +; GCN: [[ENDIF_UNIFORM]]: +; GCN: s_endpgm define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0