Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -87,10 +87,11 @@
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -159,6 +160,17 @@
   }
 }
 
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
@@ -290,7 +302,6 @@
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -403,7 +414,7 @@
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -453,10 +464,28 @@
     return;
   }
 
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;
 
-  bool FoldingImm = OpToFold.isImm();
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
 
-  if (FoldingImm && UseMI->isCopy()) {
+  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
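A note on the TODO above frameIndexMayFold: MUBUF instructions encode a 12-bit unsigned immediate offset, so one plausible shape for that heuristic is a range check on the object's frame offset before committing to the fold. The sketch below is illustrative only and not part of this patch; offsetLikelyFitsMUBUF is a hypothetical helper, and before frame lowering MachineFrameInfo::getObjectOffset still returns a local offset that may change once the final frame is laid out.

  // Hypothetical helper (not in this patch): skip the fold when the local
  // offset already exceeds MUBUF's 12-bit unsigned immediate offset field,
  // since the address would have to be materialized anyway.
  static bool offsetLikelyFitsMUBUF(const MachineFunction &MF, int FI) {
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t Offset = FrameInfo.getObjectOffset(FI);
    // Pre-lowering offsets are only estimates, so treat this as a heuristic.
    return Offset >= 0 && isUInt<12>(Offset);
  }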
@@ -517,7 +546,7 @@
   //   %sgpr = V_READFIRSTLANE_B32 %vgpr
   // =>
   //   %sgpr = S_MOV_B32 imm
-  if (FoldingImm) {
+  if (FoldingImmLike) {
     if (execMayBeModifiedBeforeUse(*MRI,
                                    UseMI->getOperand(UseOpIdx).getReg(),
                                    *OpToFold.getParent(),
@@ -528,7 +557,10 @@
 
     // FIXME: ChangeToImmediate should clear subreg
     UseMI->getOperand(1).setSubReg(0);
-    UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+    if (OpToFold.isImm())
+      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+    else
+      UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
     UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
     return;
   }
@@ -560,7 +592,7 @@
     return;
   }
 
-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -904,6 +936,9 @@
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -1170,8 +1205,7 @@
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -496,6 +496,11 @@
     return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
   }
 
+  // FIXME: Make this more precise
+  static bool isFLATScratch(const MachineInstr &MI) {
+    return isSegmentSpecificFLAT(MI);
+  }
+
   // Any FLAT encoded instruction, including global_* and scratch_*.
   bool isFLAT(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;
Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -27,6 +27,47 @@
   ret void
 }
 
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
+; GCN-NOT: v_lshrrev_b32
+; GCN-NOT: s_sub_u32
+
+; GCN: s_and_saveexec_b64
+; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+
+; GCN: [[BB1]]
+; GCN: s_or_b64 exec, exec
+define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 {
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 1
+  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+  %add3 = add nsw i32 %tmp1, 2
+  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
+  store volatile i32 9, i32 addrspace(1)* null, align 4
+  br label %bb1
+
+bb1:
+  ret void
+}
+
 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
 ; GCN: s_add_u32 s32, s32, 0xc00{{$}}
Index: test/CodeGen/AMDGPU/fold-fi-mubuf.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fold-fi-mubuf.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: no_fold_fi_non_stack_rsrc_soffset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg: '$sgpr6'
+  stackPtrOffsetReg: '$sgpr6'
+body: |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sreg_32_xm0 = S_MOV_B32 0
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
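+
+# The rsrc is not the function's scratch resource descriptor, so the frame
+# index must not be folded even though the soffset is the stack pointer.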
+---
+name: no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg: '$sgpr6'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+# Offset is from global scratch wave offset.
+---
+name: fold_fi_mubuf_scratch_scratch_wave_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
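+
+# The soffset is already the stack pointer register, so the frame index is
+# folded and the soffset is left pointing at the SP.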
+---
+name: fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fi_mubuf_scratch_sp_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
Index: test/CodeGen/AMDGPU/frame-index-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -144,9 +144,6 @@
   ret void
 }
 
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
 
 ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
@@ -156,13 +153,13 @@
 
 ; GCN: s_and_saveexec_b64
 
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -440,6 +440,32 @@
   ret float %val
 }
 
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+
+; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
+  ret float %ret.val
+}
+
+; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
+; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
+  ret float %ret.val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -60,8 +60,7 @@
 
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]]
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32