Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10976,30 +10976,19 @@ case ISD::CopyFromReg: { const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); Register Reg = R->getReg(); - if (Reg.isPhysical()) - return !TRI.isSGPRReg(MRI, Reg); - - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (!TRI.isSGPRReg(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - return false; - } - const Value *V = FLI->getValueFromVirtualReg(Reg); - if (V) + + // FIXME: Why does this need to consider isLiveIn? 
+ if (Reg.isPhysical() || MRI.isLiveIn(Reg)) + return !TRI->isSGPRReg(MRI, Reg); + + if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); - return !TRI.isSGPRReg(MRI, Reg); + return !TRI->isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { Index: llvm/test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -40,6 +40,31 @@ ret void } +; Test handling inside a non-kernel +; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func: +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0 + +; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 +; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] + +; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base +; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc +; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc + +; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] +define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32* + store volatile i32 7, i32* %stof + ret void +} + ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 0 Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -2,9 +2,7 @@ ; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5] define hidden void @use_dispatch_ptr() #1 { %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* @@ -13,9 +11,7 @@ } ; GCN-LABEL: {{^}}use_queue_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[6:7] define hidden void @use_queue_ptr() #1 { %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* @@ -34,9 +30,7 @@ } ; GCN-LABEL: {{^}}use_implicitarg_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[8:9] define hidden void @use_implicitarg_ptr() #1 { %implicit.arg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %implicit.arg.ptr to i32 addrspace(4)* @@ -198,15 +192,9 @@ ; GCN-LABEL: {{^}}use_every_sgpr_input: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 
v[[HI:[0-9]+]], s7 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5] +; GCN: s_load_dword s{{[0-9]+}}, s[6:7] +; GCN: s_load_dword s{{[0-9]+}}, s[8:9] ; GCN: ; use s[10:11] ; GCN: ; use s12 ; GCN: ; use s13 Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -2,9 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5] define hidden void @use_dispatch_ptr() #1 { %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* @@ -23,9 +21,7 @@ } ; GCN-LABEL: {{^}}use_queue_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5] define hidden void @use_queue_ptr() #1 { %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* @@ -44,10 +40,10 @@ } ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: -; CIVI: flat_load_dword v[[HI:[0-9]+]], v[0:1] +; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x10 ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] ; CIVI: 
v_mov_b32_e32 v[[LO:[0-9]+]], 16 -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} ; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}} define hidden void @use_queue_ptr_addrspacecast() #1 { @@ -401,15 +397,10 @@ ; GCN-LABEL: {{^}}use_every_sgpr_input: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5] +; GCN: s_load_dword s{{[0-9]+}}, s[6:7] +; GCN: s_load_dword s{{[0-9]+}}, s[8:9] + ; GCN: ; use s[10:11] ; GCN: ; use s12 ; GCN: ; use s13 @@ -551,15 +542,9 @@ ; GCN: s_swappc_b64 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] -; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] -; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} -; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]] -; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]] -; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}} -; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]] -; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]] -; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}} +; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_X]]:[[HI_X]]{{\]}}, 0x0 +; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Y]]:[[HI_Y]]{{\]}}, 0x0 +; GCN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO_Z]]:[[HI_Z]]{{\]}}, 0x0 ; 
GCN: ; use ; GCN: ; use [[SAVE_X]] ; GCN: ; use [[SAVE_Y]] Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -59,12 +59,7 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; MESA: v_mov_b32_e32 v0, s4 -; MESA: v_mov_b32_e32 v1, s5 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s4 -; HSA: v_mov_b32_e32 v1, s5 -; HSA: flat_load_dword v0, v[0:1] +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #0 { @@ -76,12 +71,7 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt -; MESA: v_mov_b32_e32 v0, s4 -; MESA: v_mov_b32_e32 v1, s5 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s4 -; HSA: v_mov_b32_e32 v1, s5 -; HSA: flat_load_dword v0, v[0:1] +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @opencl_func_implicitarg_ptr() #0 { @@ -165,16 +155,10 @@ ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; GCN-DAG: v_mov_b32_e32 v0, s4 -; GCN-DAG: v_mov_b32_e32 v1, s5 ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 - -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 - -; HSA: flat_load_dword v0, v[0:1] - -; GCN: s_waitcnt vmcnt(0) +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_waitcnt lgkmcnt(0) define void @func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() @@ -187,15 +171,10 @@ ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; GCN-DAG: v_mov_b32_e32 v0, s4 -; GCN-DAG: v_mov_b32_e32 v1, s5 ; GCN-DAG: s_mov_b64 
[[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 - -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: flat_load_dword v0, v[0:1] - -; GCN: s_waitcnt vmcnt(0) +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_waitcnt lgkmcnt(0) define void @opencl_func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()