// Review notes for this patch. Everything in this comment header sits ahead of
// the first "diff --git" line and is not part of any hunk.
//
// SIFoldOperands.cpp, hunk @@ -172: the predicate taking (UseMI, OpNo,
//   OpToFold) used to accept a frame-index operand only at the vaddr operand
//   of a MUBUF instruction. It now also accepts FLAT scratch instructions:
//   the saddr operand always qualifies, and the vaddr operand qualifies only
//   when the opcode has no saddr operand (SIdx == -1).
//
// SIFoldOperands.cpp, hunk @@ -631: the srsrc and soffset checks, and the
//   rewrite of an immediate soffset to the stack pointer offset register, now
//   run only when UseMI is MUBUF. After the use operand is changed to the
//   frame index, a FLAT scratch instruction that has a vaddr operand gets its
//   descriptor replaced with the opcode returned by
//   getFlatScratchInstSSfromSV.
//   NOTE(review): that function is declared to return int, but the result is
//   stored in an unsigned and handed to TII->get() without a check.
//   Presumably every opcode reaching this point has an SS counterpart --
//   confirm.
//
// SIInstrInfo.h / SIInstrInfo.td: declares getFlatScratchInstSSfromSV and
//   defines it as an InstrMapping (FilterClass "FlatScratchInst", RowFields
//   "SVOp", ColFields "Mode", KeyCol "SV", ValueCols "SS").
//
// SIRegisterInfo.cpp: inside the enableFlatScratch() / isFLATScratch(*MI)
//   path, asserts that FIOperandNum is the index of the saddr operand.
//
// flat-scratch-fold-fi.mir (new file): five si-fold-operands cases.
//   - test_fold_fi_scratch_load_vgpr / test_fold_fi_scratch_store_vgpr:
//     SCRATCH_LOAD_DWORD / SCRATCH_STORE_DWORD addressed through a VGPR copy
//     of %stack.0; the CHECK lines expect the _SADDR opcode with %stack.0.
//   - test_fold_fi_scratch_load_sgpr / test_fold_fi_scratch_store_sgpr:
//     the _SADDR forms addressed through an SGPR copy of %stack.0; the CHECK
//     lines expect %stack.0 in place of the register.
//   - test_no_fold_fi_scratch_store_vgpr: the register holding %stack.0 is
//     the first operand of SCRATCH_STORE_DWORD rather than the second, and
//     the CHECK line expects the instruction to be left unchanged.
//
// frame-index-elimination.ll: the GFX9-FLATSCR check now expects
//   "off, s32 offset:4" instead of "[[SP]], off offset:4".
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -172,9 +172,23 @@ const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) { - return OpToFold.isFI() && - TII->isMUBUF(UseMI) && - OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr); + if (!OpToFold.isFI()) + return false; + + if (TII->isMUBUF(UseMI)) + return OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), + AMDGPU::OpName::vaddr); + if (!TII->isFLATScratch(UseMI)) + return false; + + int SIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), + AMDGPU::OpName::saddr); + if (OpNo == SIdx) + return true; + + int VIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), + AMDGPU::OpName::vaddr); + return OpNo == VIdx && SIdx == -1; } FunctionPass *llvm::createSIFoldOperandsPass() { @@ -631,25 +645,36 @@ // Sanity check that this is a stack access. // FIXME: Should probably use stack pseudos before frame lowering. - if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != - MFI->getScratchRSrcReg()) - return; + if (TII->isMUBUF(*UseMI)) { + if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != + MFI->getScratchRSrcReg()) + return; - // Ensure this is either relative to the current frame or the current wave. - MachineOperand &SOff = - *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); - if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) && - (!SOff.isImm() || SOff.getImm() != 0)) - return; + // Ensure this is either relative to the current frame or the current + // wave. + MachineOperand &SOff = + *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); + if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) && + (!SOff.isImm() || SOff.getImm() != 0)) + return; + + // If this is relative to the current wave, update it to be relative to + // the current frame. 
+ if (SOff.isImm()) + SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false); + } // A frame index will resolve to a positive constant, so it should always be // safe to fold the addressing mode, even pre-GFX9. UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex()); - // If this is relative to the current wave, update it to be relative to the - // current frame. - if (SOff.isImm()) - SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false); + if (TII->isFLATScratch(*UseMI) && + AMDGPU::getNamedOperandIdx(UseMI->getOpcode(), + AMDGPU::OpName::vaddr) != -1) { + unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode()); + UseMI->setDesc(TII->get(NewOpc)); + } + return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1184,6 +1184,9 @@ LLVM_READONLY int getFlatScratchInstSTfromSS(uint16_t Opcode); + LLVM_READONLY + int getFlatScratchInstSSfromSV(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2524,6 +2524,13 @@ let ValueCols = [["ST"]]; } +def getFlatScratchInstSSfromSV : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SV"]; + let ValueCols = [["SS"]]; +} include "SIInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1498,6 +1498,10 @@ int64_t Offset = FrameInfo.getObjectOffset(Index); if (ST.enableFlatScratch()) { if (TII->isFLATScratch(*MI)) { + 
+ assert((int16_t)FIOperandNum == + AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::saddr)); + // The offset is always swizzled, just replace it if (FrameReg) FIOp.ChangeToRegister(FrameReg, false); diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir @@ -0,0 +1,88 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-enable-flat-scratch -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: test_fold_fi_scratch_load_vgpr +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0 + +... 
+ +--- +name: test_fold_fi_scratch_load_sgpr +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_load_sgpr + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0 + ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0 + %0:sgpr_32 = S_MOV_B32 %stack.0 + %1:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0 + +... + +--- +name: test_fold_fi_scratch_store_vgpr +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_store_vgpr + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = IMPLICIT_DEF + SCRATCH_STORE_DWORD %1:vgpr_32, %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + S_ENDPGM 0 + +... 
+ +--- +name: test_no_fold_fi_scratch_store_vgpr +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_no_fold_fi_scratch_store_vgpr + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: SCRATCH_STORE_DWORD [[V_MOV_B32_e32_]], [[DEF]], 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = IMPLICIT_DEF + SCRATCH_STORE_DWORD %0:vgpr_32, %1:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + S_ENDPGM 0 + +... + +--- +name: test_fold_fi_scratch_store_sgpr +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_store_sgpr + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0 + %0:sgpr_32 = S_MOV_B32 %stack.0 + %1:vgpr_32 = IMPLICIT_DEF + SCRATCH_STORE_DWORD_SADDR %1:vgpr_32, %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -166,7 +166,7 @@ ; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] ; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} -; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, [[SP]], off offset:4{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4{{$}} ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 {