Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -534,6 +534,52 @@ for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { + if (MI.isCall()) { + // Pseudo used just to encode the underlying global. Is there a better + // way to track this? + + const MachineOperand *CalleeOp + = TII->getNamedOperand(MI, AMDGPU::OpName::callee); + const Function *Callee = cast<Function>(CalleeOp->getGlobal()); + if (Callee->isDeclaration()) { + // If this is a call to an external function, we can't do much. Make + // conservative guesses. + + // 48 SGPRs - vcc, - flat_scr, -xnack + int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, + ST.hasFlatAddressSpace()); + MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); + MaxVGPR = std::max(MaxVGPR, 23); + + CalleeFrameSize = std::max(CalleeFrameSize, 16384u); + Info.UsesVCC = true; + Info.UsesFlatScratch = ST.hasFlatAddressSpace(); + Info.HasDynamicallySizedStack = true; + } else { + // We force CodeGen to run in SCC order, so the callee's register + // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); + assert(I != CallGraphResourceInfo.end() && + "callee should have been handled before caller"); + + MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); + MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + CalleeFrameSize + = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); + Info.UsesVCC |= I->second.UsesVCC; + Info.UsesFlatScratch |= I->second.UsesFlatScratch; + Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; + Info.HasRecursion |= I->second.HasRecursion; + } + + if (!Callee->doesNotRecurse()) + Info.HasRecursion = true; + + // Skip the standard operand checks with a call. We have operands like + // SP (which may not have really been used in the callee). 
+ continue; + } + // TODO: Check regmasks? Do they occur anywhere except calls? for (const MachineOperand &MO : MI.operands()) { unsigned Width = 0; @@ -630,48 +676,6 @@ MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; } } - - if (MI.isCall()) { - // Pseudo used just to encode the underlying global. Is there a better - // way to track this? - - const MachineOperand *CalleeOp - = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = cast<Function>(CalleeOp->getGlobal()); - if (Callee->isDeclaration()) { - // If this is a call to an external function, we can't do much. Make - // conservative guesses. - - // 48 SGPRs - vcc, - flat_scr, -xnack - int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, - ST.hasFlatAddressSpace()); - MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); - MaxVGPR = std::max(MaxVGPR, 23); - - CalleeFrameSize = std::max(CalleeFrameSize, 16384u); - Info.UsesVCC = true; - Info.UsesFlatScratch = ST.hasFlatAddressSpace(); - Info.HasDynamicallySizedStack = true; - } else { - // We force CodeGen to run in SCC order, so the callee's register - // usage etc. should be the cumulative usage of all callees. 
- auto I = CallGraphResourceInfo.find(Callee); - assert(I != CallGraphResourceInfo.end() && - "callee should have been handled before caller"); - - MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); - MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); - CalleeFrameSize - = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); - Info.UsesVCC |= I->second.UsesVCC; - Info.UsesFlatScratch |= I->second.UsesFlatScratch; - Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; - Info.HasRecursion |= I->second.HasRecursion; - } - - if (!Callee->doesNotRecurse()) - Info.HasRecursion = true; - } } } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3033,6 +3033,7 @@ } case AMDGPU::SI_CALL_ISEL: case AMDGPU::SI_TCRETURN_ISEL: { + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); @@ -3061,6 +3062,7 @@ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) MIB.add(MI.getOperand(I)); + MIB.addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MI.eraseFromParent(); return BB; Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -71,8 +71,8 @@ ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4 ; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA-NEXT: s_mov_b32 s4, s33 -; HSA-NEXT: s_mov_b32 s32, s33 +; HSA: s_mov_b32 s32, s33 +; HSA: s_mov_b32 s4, s33 ; MESA-DAG: s_mov_b32 s4, s33{{$}} ; 
MESA-DAG: s_mov_b32 s32, s33{{$}} @@ -111,6 +111,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 @@ -118,7 +119,6 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-DAG: s_mov_b32 s4, s33{{$}} -; GCN-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -131,6 +131,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: ; HSA-DAG: s_mov_b32 s33, s9{{$}} ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-DAG: s_mov_b32 s32, s3 ; GCN-DAG: buffer_load_sbyte v0 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} @@ -138,7 +139,6 @@ ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s32, s3 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -152,6 +152,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: ; MESA-DAG: s_mov_b32 s33, s3{{$}} ; HSA-DAG: s_mov_b32 s33, s9{{$}} +; GCN-DAG: s_mov_b32 s32, s33 ; GCN-DAG: buffer_load_ubyte v0 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} @@ -159,7 +160,6 @@ ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -184,6 +184,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-DAG: s_mov_b32 s32, s33 ; GCN-DAG: buffer_load_sshort v0 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} @@ -191,7 +192,6 @@ ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s4, s33 -; 
GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -207,12 +207,12 @@ ; GCN-DAG: buffer_load_ushort v0 +; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 ; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -225,13 +225,13 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: ; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 ; GCN: v_mov_b32_e32 v0, 42 ; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -6,11 +6,11 @@ ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 s33, s7 -; GCN: s_getpc_b64 s[34:35] +; GCN: s_mov_b32 s32, s33 +; GCN-NEXT: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, ; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s[34:35] ; GCN-NEXT: s_mov_b32 s4, s33 @@ -113,13 +113,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: ; GCN: s_mov_b32 s34, s9 -; GCN: ; def s33 -; GCN-NEXT: #ASMEND +; GCN-DAG: s_mov_b32 s32, s34 +; GCN-DAG: ; 
def s33 ; GCN: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s4, s34 -; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s33 @@ -134,13 +133,12 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: ; GCN: s_mov_b32 s33, s9 -; GCN: ; def v32 -; GCN-NEXT: #ASMEND +; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: ; def v32 ; GCN: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v32 @@ -167,11 +165,11 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: ; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 ; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -193,9 +193,10 @@ ; GCN-NOT: s6 ; GCN: s_mov_b32 s33, s7 ; GCN-NOT: s6 +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: s6 ; GCN: s_mov_b32 s4, s33 ; GCN-NOT: s6 -; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() @@ -208,9 +209,9 @@ ; GCN: enable_sgpr_workgroup_id_z = 0 ; GCN: s_mov_b32 s33, s8 +; GCN: s_mov_b32 s32, s33 ; GCN: s_mov_b32 s4, s33 ; GCN: s_mov_b32 s6, s7 -; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { call 
void @use_workgroup_id_y() @@ -239,10 +240,12 @@ ; GCN: s_mov_b32 s33, s8 ; GCN-NOT: s6 ; GCN-NOT: s7 +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: s6 +; GCN-NOT: s7 ; GCN: s_mov_b32 s4, s33 ; GCN-NOT: s6 ; GCN-NOT: s7 -; GCN: s_mov_b32 s32, s33 ; GCN-NOT: s6 ; GCN-NOT: s7 ; GCN: s_swappc_b64 @@ -262,13 +265,18 @@ ; GCN-NOT: s7 ; GCN-NOT: s8 -; GCN: s_mov_b32 s4, s33 + +; GCN: s_mov_b32 s32, s33 ; GCN-NOT: s6 ; GCN-NOT: s7 ; GCN-NOT: s8 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s33 + +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 ; GCN-NOT: s6 ; GCN-NOT: s7 @@ -286,14 +294,16 @@ ; GCN: enable_sgpr_workgroup_id_z = 1 ; GCN: s_mov_b32 s33, s8 + ; GCN-NOT: s6 ; GCN-NOT: s7 -; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s32, s33 + ; GCN-NOT: s6 ; GCN-NOT: s7 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s33 ; GCN-NOT: s6 ; GCN-NOT: s7 @@ -310,9 +320,9 @@ ; GCN: s_mov_b32 s33, s9 ; GCN: s_mov_b32 s6, s7 +; GCN: s_mov_b32 s32, s33 ; GCN: s_mov_b32 s4, s33 ; GCN: s_mov_b32 s7, s8 -; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { call void @use_workgroup_id_yz() @@ -378,11 +388,11 @@ ; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN-DAG: s_mov_b32 s32, s33 ; GCN-NOT: s6 ; GCN: s_mov_b32 s4, s33 ; GCN-NOT: s6 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { call void @other_arg_use_workgroup_id_x(i32 555) @@ -396,9 +406,10 @@ ; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN-DAG: s_mov_b32 s32, s33 + ; GCN: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s6, s7 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { call void @other_arg_use_workgroup_id_y(i32 555) @@ -412,10 +423,10 @@ ; GCN: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s6, s7 -; 
GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { call void @other_arg_use_workgroup_id_z(i32 555) @@ -480,8 +491,8 @@ ; GCN: s_mov_b64 s[10:11], s[8:9] ; GCN: s_mov_b64 s[8:9], s[6:7] ; GCN: s_mov_b64 s[6:7], s[4:5] -; GCN: s_mov_b32 s4, s33 ; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { call void @use_every_sgpr_input() Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -62,10 +62,10 @@ ; HSA-NOENV: kernarg_segment_byte_size = 112 ; MESA: kernarg_segment_byte_size = 464 -; HSA: s_add_u32 s6, s4, 0x70 -; MESA: s_add_u32 s6, s4, 0x1c0 +; HSA: s_add_u32 s6, s6, 0x70 +; MESA: s_add_u32 s6, s6, 0x1c0 -; GCN: s_addc_u32 s7, s5, 0{{$}} +; GCN: s_addc_u32 s7, s7, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { call void @func_implicitarg_ptr()