Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -317,8 +317,11 @@
   MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-  assert(TII->isMUBUF(MI));
+  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
+         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
+         "should only be seeing frame offset relative FrameIndex");
+
   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
   int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -978,12 +981,72 @@
     }
     default: {
-      if (TII->isMUBUF(*MI)) {
+      const DebugLoc &DL = MI->getDebugLoc();
+      bool IsMUBUF = TII->isMUBUF(*MI);
+
+      if (!IsMUBUF &&
+          MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+        // Convert to an absolute stack address by finding the offset from the
+        // scratch wave base and scaling by the wave size.
+        //
+        // In an entry function/kernel the stack address is already the absolute
+        // address relative to the scratch wave offset.
+
+        unsigned DiffReg
+          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
+        unsigned ResultReg = IsCopy ?
+          MI->getOperand(0).getReg() :
+          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
+          .addReg(MFI->getFrameOffsetReg())
+          .addReg(MFI->getScratchWaveOffsetReg());
+
+        int64_t Offset = FrameInfo.getObjectOffset(Index);
+        if (Offset == 0) {
+          // XXX - This never happens because of emergency scavenging slot at 0?
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
+            .addImm(Log2_32(ST.getWavefrontSize()))
+            .addReg(DiffReg);
+        } else {
+          unsigned CarryOut
+            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+          unsigned ScaledReg
+            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+          // XXX - Should this use a vector shift?
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+            .addReg(DiffReg, RegState::Kill)
+            .addImm(Log2_32(ST.getWavefrontSize()));
+
+          // TODO: Fold if use instruction is another add of a constant.
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+            .addReg(CarryOut, RegState::Define | RegState::Dead)
+            .addImm(Offset)
+            .addReg(ScaledReg, RegState::Kill);
+
+          MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
+        }
+
+        // Don't introduce an extra copy if we're just materializing in a mov.
+        if (IsCopy)
+          MI->eraseFromParent();
+        else
+          FIOp.ChangeToRegister(ResultReg, false, false, true);
+        return;
+      }
+
+      if (IsMUBUF) {
         // Disable offen so we don't need a 0 vgpr base.
         assert(static_cast<int>(FIOperandNum) ==
                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr));
+        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
+               == MFI->getFrameOffsetReg());
+
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         int64_t OldImm
           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
@@ -992,17 +1055,19 @@
         if (isUInt<12>(NewOffset) &&
             buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
           MI->eraseFromParent();
-          break;
+          return;
         }
       }
 
+      // If the offset is simply too big, don't convert to a scratch wave offset
+      // relative index.
+
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
         unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(*MBB, MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
-          .addImm(Offset);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+          .addImm(Offset);
         FIOp.ChangeToRegister(TmpReg, false, false, true);
       }
     }
Index: test/CodeGen/AMDGPU/frame-index-elimination.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that non-entry function frame indices are expanded properly to
+; give an index relative to the scratch wave offset register.
+
+; Materialize into a mov. Make sure there isn't an unnecessary copy.
+; GCN-LABEL: {{^}}func_mov_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 vcc_hi, s6, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_mov_fi_i32() #0 {
+  %alloca = alloca i32
+  store volatile i32* %alloca, i32* addrspace(3)* undef
+  ret void
+}
+
+; Materialize into an add of a constant offset from the FI.
+; FIXME: Should be able to merge adds
+
+; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 s7, s6, s4
+; GCN-NEXT: s_lshr_b32 s7, s7, 6
+; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s7, 4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_add_constant_to_fi_i32() #0 {
+  %alloca = alloca [2 x i32], align 4
+  %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
+  store volatile i32* %gep0, i32* addrspace(3)* undef
+  ret void
+}
+
+; A user that the materialized frame index can't be meaningfully folded
+; into.
+
+; GCN-LABEL: {{^}}func_other_fi_user_i32:
+; GCN: s_sub_u32 vcc_hi, s6, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_i32() #0 {
+  %alloca = alloca [2 x i32], align 4
+  %ptrtoint = ptrtoint [2 x i32]* %alloca to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
+; GCN: v_mov_b32_e32 v1, 15{{$}}
+; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
+define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
+  store volatile i32 15, i32* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
+; GCN: s_waitcnt
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
+define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
+  %val = load volatile i32, i32* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }
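
For reference, the non-MUBUF expansion added in the default case rewrites the frame index as the
difference between the frame offset register and the scratch wave offset register, shifted right
by log2 of the wavefront size, plus the frame object's offset. A minimal standalone C++ sketch of
that arithmetic follows; the function name and parameters are illustrative only and are not part
of the patch or of the LLVM API.

  #include <cstdint>

  // Models ResultReg = ((FrameOffset - ScratchWaveOffset) >> log2(WavefrontSize)) + ObjectOffset,
  // i.e. the S_SUB_U32 / S_LSHR_B32 / V_ADD_I32_e64 sequence built above.
  static int64_t expandFrameIndexValue(uint32_t FrameOffset, uint32_t ScratchWaveOffset,
                                       unsigned WavefrontSize, int64_t ObjectOffset) {
    uint32_t Diff = FrameOffset - ScratchWaveOffset;         // S_SUB_U32
    uint32_t Scaled = Diff >> __builtin_ctz(WavefrontSize);  // S_LSHR_B32 by Log2_32(wave size)
    return static_cast<int64_t>(Scaled) + ObjectOffset;      // V_ADD_I32_e64
  }

With a wavefront size of 64 the shift amount is 6, which is the constant the s_lshr_b32 checks in
frame-index-elimination.ll expect, and the trailing 4 in the v_add_i32_e64 checks is the frame
object offset returned by FrameInfo.getObjectOffset(Index).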