Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -653,9 +653,28 @@
     case Intrinsic::amdgcn_readlane:
     case Intrinsic::amdgcn_icmp:
     case Intrinsic::amdgcn_fcmp:
+    case Intrinsic::amdgcn_if_break:
       return true;
     }
   }
+
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1)
+          return true;
+      }
+      }
+    }
+  }
+
+  // FIXME: Should handle inline asm with sgpr output.
   return false;
 }
 
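For reference, the IR shape the new ExtractValueInst case recognizes is sketched below. This is a hand-written illustration only: SIAnnotateControlFlow normally inserts these intrinsics itself, and the sketch assumes a wave64 target, so the mask type is i64 and the intrinsic names mangle as .i64.

    declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
    declare void @llvm.amdgcn.end.cf.i64(i64)

    define amdgpu_ps float @extract_saved_exec(i32 %v) {
    entry:
      %cond = icmp eq i32 %v, 0
      %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
      ; Field 0 is the per-lane branch condition and stays divergent.
      %vcc = extractvalue { i1, i64 } %if, 0
      ; Field 1 is the saved exec mask, which is the same in every lane;
      ; extracting index 1 is the case reported always uniform above.
      %mask = extractvalue { i1, i64 } %if, 1
      br i1 %vcc, label %then, label %endif

    then:
      br label %endif

    endif:
      %r = phi float [ 1.0, %then ], [ 0.0, %entry ]
      call void @llvm.amdgcn.end.cf.i64(i64 %mask)
      ret float %r
    }

llvm.amdgcn.if_break likewise produces an updated loop mask that is lane-invariant by construction, which is why it joins the switch of always-uniform intrinsics.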
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10888,72 +10888,10 @@
   return RC;
 }
 
-static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
-  if (!isa<Instruction>(V))
-    return false;
-  if (!Visited.insert(V).second)
-    return false;
-  bool Result = false;
-  for (auto U : V->users()) {
-    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
-      if (V == U->getOperand(1)) {
-        switch (Intrinsic->getIntrinsicID()) {
-        default:
-          Result = false;
-          break;
-        case Intrinsic::amdgcn_if_break:
-        case Intrinsic::amdgcn_if:
-        case Intrinsic::amdgcn_else:
-          Result = true;
-          break;
-        }
-      }
-      if (V == U->getOperand(0)) {
-        switch (Intrinsic->getIntrinsicID()) {
-        default:
-          Result = false;
-          break;
-        case Intrinsic::amdgcn_end_cf:
-        case Intrinsic::amdgcn_loop:
-          Result = true;
-          break;
-        }
-      }
-    } else {
-      Result = hasCFUser(U, Visited);
-    }
-    if (Result)
-      break;
-  }
-  return Result;
-}
-
+// FIXME: GCNTTIImpl::isAlwaysUniform should be taught about sgpr constraints
+// and this hook removed.
 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                                const Value *V) const {
-  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    switch (Intrinsic->getIntrinsicID()) {
-    default:
-      return false;
-    case Intrinsic::amdgcn_if_break:
-      return true;
-    }
-  }
-  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
-    if (const IntrinsicInst *Intrinsic =
-            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
-      switch (Intrinsic->getIntrinsicID()) {
-      default:
-        return false;
-      case Intrinsic::amdgcn_if:
-      case Intrinsic::amdgcn_else: {
-        ArrayRef<unsigned> Indices = ExtValue->getIndices();
-        if (Indices.size() == 1 && Indices[0] == 1) {
-          return true;
-        }
-      }
-      }
-    }
-  }
   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
     if (isa<InlineAsm>(CI->getCalledValue())) {
       const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
@@ -10978,6 +10916,6 @@
       }
     }
   }
-  SmallPtrSet<const Value *, 16> Visited;
-  return hasCFUser(V, Visited);
+
+  return false;
 }
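After this change the hook only checks inline asm with SGPR output constraints, which divergence analysis cannot see through. A minimal sketch of that remaining case, assuming the standard AMDGPU "s" constraint (the function name is made up for illustration):

    define amdgpu_ps i32 @sgpr_asm_output() {
      ; The "=s" output constraint pins the result to an SGPR, so the
      ; selected register class must be uniform even though nothing in
      ; the IR itself proves the value uniform.
      %v = call i32 asm "s_mov_b32 $0, 42", "=s"()
      ret i32 %v
    }

Per the FIXME above, teaching GCNTTIImpl::isAlwaysUniform about this constraint would allow deleting the hook entirely.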
Index: llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -17,43 +17,39 @@
 ; CHECK-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; CHECK-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
 ; CHECK-NEXT:    s_mov_b64 s[2:3], 0
-; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; CHECK-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; CHECK-NEXT:    s_branch BB0_3
 ; CHECK-NEXT:  BB0_1: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[6:7], 0
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT:    s_or_b64 s[2:3], s[10:11], s[2:3]
-; CHECK-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
-; CHECK-NEXT:    s_and_b64 s[8:9], s[8:9], exec
-; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
+; CHECK-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    s_cbranch_execz BB0_6
 ; CHECK-NEXT:  BB0_3: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], exec
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 32
-; CHECK-NEXT:    s_mov_b64 s[8:9], -1
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
 ; CHECK-NEXT:    s_cbranch_scc0 BB0_2
 ; CHECK-NEXT:  ; %bb.4: ; %endif1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_mov_b64 s[6:7], -1
-; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execz BB0_1
 ; CHECK-NEXT:  ; %bb.5: ; %endif2
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_add_i32 s0, s0, 1
-; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT:    s_xor_b64 s[4:5], exec, -1
 ; CHECK-NEXT:    s_branch BB0_1
 ; CHECK-NEXT:  BB0_6: ; %Flow2
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
 ; CHECK-NEXT:  ; %bb.7: ; %if1
 ; CHECK-NEXT:    v_sqrt_f32_e32 v1, v0
 ; CHECK-NEXT:  ; %bb.8: ; %endloop
Index: llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -7,44 +7,40 @@
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    ; implicit-def: $sgpr8_sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; SI-NEXT:    s_branch BB0_3
 ; SI-NEXT:  BB0_1: ; %Flow1
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; SI-NEXT:  BB0_2: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_and_b64 s[14:15], exec, s[10:11]
-; SI-NEXT:    s_or_b64 s[4:5], s[14:15], s[4:5]
-; SI-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
-; SI-NEXT:    s_and_b64 s[12:13], s[12:13], exec
-; SI-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; SI-NEXT:    s_and_b64 s[12:13], exec, s[8:9]
+; SI-NEXT:    s_or_b64 s[4:5], s[12:13], s[4:5]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz BB0_6
 ; SI-NEXT:  BB0_3: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_or_b64 s[10:11], s[10:11], exec
+; SI-NEXT:    s_or_b64 s[8:9], s[8:9], exec
 ; SI-NEXT:    s_cmp_gt_u32 s6, 3
-; SI-NEXT:    v_cmp_lt_u32_e64 s[12:13], s6, 4
+; SI-NEXT:    v_cmp_lt_u32_e64 s[10:11], s6, 4
 ; SI-NEXT:    s_cbranch_scc1 BB0_2
 ; SI-NEXT:  ; %bb.4: ; %mid.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen
-; SI-NEXT:    s_mov_b64 s[12:13], -1
+; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
-; SI-NEXT:    s_mov_b64 s[10:11], -1
-; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_mov_b64 s[8:9], -1
+; SI-NEXT:    s_and_saveexec_b64 s[12:13], vcc
 ; SI-NEXT:    s_cbranch_execz BB0_1
 ; SI-NEXT:  ; %bb.5: ; %end.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; SI-NEXT:    s_add_i32 s6, s6, 1
-; SI-NEXT:    s_xor_b64 s[10:11], exec, -1
+; SI-NEXT:    s_xor_b64 s[8:9], exec, -1
 ; SI-NEXT:    s_branch BB0_1
 ; SI-NEXT:  BB0_6: ; %for.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[8:9]
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[10:11]
 ; SI-NEXT:    s_cbranch_execz BB0_8
 ; SI-NEXT:  ; %bb.7: ; %if
 ; SI-NEXT:    exp mrt0 v0, v0, v0, v0 done vm
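The regenerated checks above and below all show the same payoff: the value copied out of the loop is now known uniform at the IR level, so one implicit-def of an SGPR pair and the three-instruction lane-mask merge (s_andn2_b64 / s_and_b64 / s_or_b64) drop out of each Flow block, and the remaining mask registers renumber accordingly.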
Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -188,15 +188,11 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GCN-NEXT:    s_branch BB1_2
 ; GCN-NEXT:  BB1_1: ; %Flow4
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    s_and_b64 s[6:7], exec, s[6:7]
 ; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
-; GCN-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
-; GCN-NEXT:    s_and_b64 s[6:7], s[8:9], exec
-; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GCN-NEXT:    s_cbranch_execz BB1_9
 ; GCN-NEXT:  BB1_2: ; %bb1
@@ -211,23 +207,23 @@
 ; GCN-NEXT:    s_mov_b64 s[6:7], -1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    s_and_b64 vcc, exec, vcc
-; GCN-NEXT:    s_mov_b64 s[8:9], -1
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_cbranch_vccz BB1_5
 ; GCN-NEXT:  ; %bb.4: ; %case1
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v2
-; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_orn2_b64 s[6:7], vcc, exec
 ; GCN-NEXT:  BB1_5: ; %Flow3
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-NEXT:    s_and_b64 vcc, exec, s[10:11]
+; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_vccz BB1_1
 ; GCN-NEXT:    s_branch BB1_7
 ; GCN-NEXT:  BB1_6: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_mov_b64 s[6:7], -1
 ; GCN-NEXT:    s_and_b64 vcc, exec, -1
 ; GCN-NEXT:    s_cbranch_execz BB1_1
@@ -235,17 +231,17 @@
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GCN-NEXT:    s_and_b64 vcc, exec, vcc
-; GCN-NEXT:    s_mov_b64 s[8:9], -1
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_cbranch_vccz BB1_1
 ; GCN-NEXT:  ; %bb.8: ; %case0
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GCN-NEXT:    s_branch BB1_1
 ; GCN-NEXT:  BB1_9: ; %Flow6
 ; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
Index: llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -98,27 +98,28 @@
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xc
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xc
+; SI-NEXT:    ; implicit-def: $sgpr0
 ; SI-NEXT:    v_cmp_lg_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; SI-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB2_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s6, s2, s3
+; SI-NEXT:    s_add_i32 s0, s10, s11
 ; SI-NEXT:  BB2_2: ; %Flow
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[2:3]
 ; SI-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_cbranch_execz BB2_4
 ; SI-NEXT:  ; %bb.3: ; %if
-; SI-NEXT:    s_add_i32 s0, s0, s1
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:  ; %bb.4: ; %endif
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_add_i32 s0, s8, s9
+; SI-NEXT:  BB2_4: ; %endif
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 entry:
Index: llvm/test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/smrd.ll
+++ llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -411,7 +411,7 @@
 
 ; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted
 ; GCN: v_readfirstlane
-define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
+define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* noalias dereferenceable(18446744073709551615), i32) #0 {
 main_body:
   %descptr = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0
   br label %.outer_loop_header