Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -173,10 +173,17 @@ } } -static void addDefsToList(const MachineInstr &MI, DenseSet &Defs) { - for (const MachineOperand &Def : MI.operands()) { - if (Def.isReg() && Def.isDef()) - Defs.insert(Def.getReg()); +static void addDefsUsesToList(const MachineInstr &MI, + DenseSet &RegDefs, + DenseSet &PhysRegUses) { + for (const MachineOperand &Op : MI.operands()) { + if (Op.isReg()) { + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + else if (Op.readsReg() && + TargetRegisterInfo::isPhysicalRegister(Op.getReg())) + PhysRegUses.insert(Op.getReg()); + } } } @@ -195,16 +202,24 @@ // already in the list. Returns true in that case. static bool addToListsIfDependent(MachineInstr &MI, - DenseSet &Defs, + DenseSet &RegDefs, + DenseSet &PhysRegUses, SmallVectorImpl &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move // this instruction after the merged instructions. - - if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) { + // + // Similarly, if there is a def which is read by an instruction that is to + // be moved for merging, then we need to move the def-instruction as well. + // This can only happen for physical registers such as M0; virtual + // registers are in SSA form. + if (Use.isReg() && + ((Use.readsReg() && RegDefs.count(Use.getReg())) || + (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && + PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); - addDefsToList(MI, Defs); + addDefsUsesToList(MI, RegDefs, PhysRegUses); return true; } } @@ -228,16 +243,6 @@ return true; } -static bool -hasPhysRegDef(MachineInstr &MI) { - for (const MachineOperand &Def : MI.defs()) { - if (Def.isReg() && - TargetRegisterInfo::isPhysicalRegister(Def.getReg())) - return true; - } - return false; -} - bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? @@ -343,8 +348,9 @@ ++MBBI; - DenseSet DefsToMove; - addDefsToList(*CI.I, DefsToMove); + DenseSet RegDefsToMove; + DenseSet PhysRegUsesToMove; + addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); for ( ; MBBI != E; ++MBBI) { if (MBBI->getOpcode() != CI.I->getOpcode()) { @@ -360,13 +366,6 @@ return false; } - if (hasPhysRegDef(*MBBI)) { - // We could re-order this instruction in theory, but it would require - // tracking physreg defs and uses. This should only affect M0 in - // practice. - return false; - } - if (MBBI->mayLoadOrStore() && (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { @@ -374,14 +373,15 @@ // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. CI.InstsToMove.push_back(&*MBBI); - addDefsToList(*MBBI, DefsToMove); + addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); continue; } // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); + addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove); continue; } @@ -395,7 +395,8 @@ // DS_WRITE_B32 addr, f(w), idx1 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. - if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) + if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove)) continue; bool Match = true; @@ -454,8 +455,7 @@ // down past this instruction. // check if we can move I across MBBI and if we can move all I's users if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) || - hasPhysRegDef(*MBBI)) + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) break; } return false; Index: test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2.ll +++ test/CodeGen/AMDGPU/ds_read2.ll @@ -629,6 +629,27 @@ ret void } +; GCN-LABEL: {{^}}ds_read_interp_read: +; CI: s_mov_b32 m0, -1 +; CI: ds_read_b32 +; CI: s_mov_b32 m0, s0 +; CI: v_interp_mov_f32 +; CI: s_mov_b32 m0, -1 +; CI: ds_read_b32 +; GFX9: ds_read2_b32 v[0:1], v0 offset1:4 +; GFX9: s_mov_b32 m0, s0 +; GFX9: v_interp_mov_f32 +define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) { + %v0 = load float, float addrspace(3)* %inptr, align 4 + %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims) + %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4 + %v1 = load float, float addrspace(3)* %ptr1, align 4 + %v1b = fadd float %v1, %intrp + %r0 = insertelement <2 x float> undef, float %v0, i32 0 + %r1 = insertelement <2 x float> %r0, float %v1b, i32 1 + ret <2 x float> %r1 +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 @@ -636,6 +657,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 declare i32 @llvm.amdgcn.workitem.id.y() #1 +declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone + declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/merge-load-store-physreg.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/merge-load-store-physreg.mir @@ -0,0 +1,116 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s + +# Check that SILoadStoreOptimizer honors physregs defs/uses between moved +# instructions. +# +# The following IR snippet would usually be optimized by the peephole optimizer. +# However, an equivalent situation can occur with buffer instructions as well. + +# CHECK-LABEL: name: scc_def_and_use_no_dependency +# CHECK: S_ADD_U32 +# CHECK: S_ADDC_U32 +# CHECK: DS_READ2_B32 +--- | + define amdgpu_kernel void @scc_def_and_use_no_dependency(i32 addrspace(3)* %ptr.0) nounwind { + %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1 + %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16 + ret void + } + + define amdgpu_kernel void @scc_def_and_use_dependency(i32 addrspace(3)* %ptr.0) nounwind { + %ptr.4 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 1 + %ptr.64 = getelementptr i32, i32 addrspace(3)* %ptr.0, i32 16 + ret void + } +... +--- +name: scc_def_and_use_no_dependency +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: false +liveins: + - { reg: '$vgpr0' } + - { reg: '$sgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0: + liveins: $vgpr0, $sgpr0 + + %1:vgpr_32 = COPY $vgpr0 + %10:sgpr_32 = COPY $sgpr0 + + $m0 = S_MOV_B32 -1 + %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0) + + %11:sgpr_32 = S_ADD_U32 %10, 4, implicit-def $scc + %12:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc + + %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64) + S_ENDPGM + +... + +# CHECK-LABEL: name: scc_def_and_use_dependency +# CHECK: DS_READ2_B32 +# CHECK: S_ADD_U32 +# CHECK: S_ADDC_U32 +--- +name: scc_def_and_use_dependency +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: false +liveins: + - { reg: '$vgpr0' } + - { reg: '$sgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0: + liveins: $vgpr0, $sgpr0 + + %1:vgpr_32 = COPY $vgpr0 + %10:sgpr_32 = COPY $sgpr0 + + $m0 = S_MOV_B32 -1 + %2:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.0) + %20:sgpr_32 = V_READFIRSTLANE_B32 %2, implicit $exec + + %21:sgpr_32 = S_ADD_U32 %20, 4, implicit-def $scc + ; The S_ADDC_U32 depends on the first DS_READ_B32 only via SCC + %11:sgpr_32 = S_ADDC_U32 %10, 0, implicit-def dead $scc, implicit $scc + + %3:vgpr_32 = DS_READ_B32 %1, 64, 0, implicit $m0, implicit $exec :: (load 4 from %ir.ptr.64) + S_ENDPGM + +... Index: test/CodeGen/AMDGPU/smrd.ll =================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s ; SMRD load with an immediate offset. @@ -232,15 +232,24 @@ ret void } -; GCN-LABEL: {{^}}smrd_imm_nomerge_m0: +; GCN-LABEL: {{^}}smrd_imm_merge_m0: ; -; In principle we could merge the loads here as well, but it would require -; careful tracking of physical registers since both v_interp* and v_movrel* -; instructions (or gpr idx mode) use M0. +; SICIVI: s_buffer_load_dwordx2 +; SICIVI: s_mov_b32 m0 +; SICIVI_DAG: v_interp_p1_f32 +; SICIVI_DAG: v_interp_p1_f32 +; SICIVI_DAG: v_interp_p1_f32 +; SICIVI_DAG: v_interp_p2_f32 +; SICIVI_DAG: v_interp_p2_f32 +; SICIVI_DAG: v_interp_p2_f32 +; SICIVI: s_mov_b32 m0 +; SICIVI: v_movrels_b32_e32 ; -; GCN: s_buffer_load_dword -; GCN: s_buffer_load_dword -define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { +; Merging is disabled on GFX9 due to a cache line straddling bug. +; +; GFX9: s_buffer_load_dword +; GFX9: s_buffer_load_dword +define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 { main_body: %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0) %idx1 = bitcast float %idx1.f to i32