Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -173,10 +173,18 @@
   }
 }
 
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  // XXX: Should this be looking for implicit defs?
-  for (const MachineOperand &Def : MI.defs())
-    Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+                              DenseSet<unsigned> &RegDefs,
+                              DenseSet<unsigned> &PhysRegUses) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg()) {
+      if (Op.isDef())
+        RegDefs.insert(Op.getReg());
+      else if (Op.readsReg() &&
+               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+        PhysRegUses.insert(Op.getReg());
+    }
+  }
 }
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
@@ -194,16 +202,24 @@
 // already in the list. Returns true in that case.
 static bool
 addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &Defs,
+                      DenseSet<unsigned> &RegDefs,
+                      DenseSet<unsigned> &PhysRegUses,
                       SmallVectorImpl<MachineInstr *> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
     // this instruction after the merged instructions.
-
-    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+    //
+    // Similarly, if there is a def which is read by an instruction that is to
+    // be moved for merging, then we need to move the def-instruction as well.
+    // This can only happen for physical registers such as M0; virtual
+    // registers are in SSA form.
+    if (Use.isReg() &&
+        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
-      addDefsToList(MI, Defs);
+      addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
     }
   }
@@ -227,16 +243,6 @@
   return true;
 }
 
-static bool
-hasPhysRegDef(MachineInstr &MI) {
-  for (const MachineOperand &Def : MI.defs()) {
-    if (Def.isReg() &&
-        TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
-      return true;
-  }
-  return false;
-}
-
 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
@@ -342,8 +348,9 @@
 
   ++MBBI;
 
-  DenseSet<unsigned> DefsToMove;
-  addDefsToList(*CI.I, DefsToMove);
+  DenseSet<unsigned> RegDefsToMove;
+  DenseSet<unsigned> PhysRegUsesToMove;
+  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
     if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -359,13 +366,6 @@
         return false;
       }
 
-      if (hasPhysRegDef(*MBBI)) {
-        // We could re-order this instruction in theory, but it would require
-        // tracking physreg defs and uses. This should only affect M0 in
-        // practice.
-        return false;
-      }
-
       if (MBBI->mayLoadOrStore() &&
           (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
@@ -373,14 +373,15 @@
         // #2. Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
         CI.InstsToMove.push_back(&*MBBI);
-        addDefsToList(*MBBI, DefsToMove);
+        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
         continue;
       }
 
       // When we match I with another DS instruction we will be moving I down
       // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
-      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                            CI.InstsToMove);
       continue;
     }
 
@@ -394,7 +395,8 @@
     //   DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                              CI.InstsToMove))
       continue;
 
     bool Match = true;
@@ -453,8 +455,7 @@
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
-        hasPhysRegDef(*MBBI))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
       break;
   }
   return false;
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICI -check-prefix=SIVIGFX9 %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI -check-prefix=SIVIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=VIGFX9 -check-prefix=SIVIGFX9 %s
 
 ; SMRD load with an immediate offset.
@@ -242,15 +242,24 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
+; GCN-LABEL: {{^}}smrd_imm_merge_m0:
 ;
-; In principle we could merge the loads here as well, but it would require
-; careful tracking of physical registers since both v_interp* and v_movrel*
-; instructions (or gpr idx mode) use M0.
+; SICIVI: s_buffer_load_dwordx2
+; SICIVI: s_mov_b32 m0
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p1_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI-DAG: v_interp_p2_f32
+; SICIVI: s_mov_b32 m0
+; SICIVI: v_movrels_b32_e32
 ;
-; GCN: s_buffer_load_dword
-; GCN: s_buffer_load_dword
-define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
+; Merging is still thwarted on GFX9 due to s_set_gpr_idx
+;
+; GFX9: s_buffer_load_dword
+; GFX9: s_buffer_load_dword
+define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
 main_body:
   %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
   %idx1 = bitcast float %idx1.f to i32