Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -137,7 +137,7 @@
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -150,10 +150,10 @@
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
-                      "SI Load / Store Optimizer", false, false)
+                      "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load / Store Optimizer", false, false)
+                    "SI Load Store Optimizer", false, false)
 
 char SILoadStoreOptimizer::ID = 0;
 
@@ -496,13 +496,15 @@
   unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
       .addReg(AddrReg->getReg());
   }
 
@@ -556,7 +558,7 @@
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
   const MachineOperand *Data1
       = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
@@ -579,17 +581,19 @@
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();
 
-  unsigned BaseReg = Addr->getReg();
+  unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
-      .addReg(Addr->getReg());
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
+      .addReg(AddrReg->getReg());
   }
 
   MachineInstrBuilder Write2 =
Index: test/CodeGen/AMDGPU/ds-combine-large-stride.ll
===================================================================
--- test/CodeGen/AMDGPU/ds-combine-large-stride.ll
+++ test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -5,9 +5,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -50,9 +50,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -132,8 +132,8 @@
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -170,7 +170,7 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
 
 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
@@ -211,8 +211,8 @@
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -249,9 +249,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -285,9 +285,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -349,8 +349,8 @@
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
@@ -380,7 +380,7 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
 
 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
@@ -412,8 +412,8 @@
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
 
 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
Index: test/CodeGen/AMDGPU/merge-load-store-vreg.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/merge-load-store-vreg.mir
@@ -0,0 +1,66 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,VI %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+# If there is a base offset, check that SILoadStoreOptimizer creates
+# V_ADD_{I|U}32_e64 for it: the _e64 form writes its carry out to a vreg
+# (rather than to %vcc, as the _e32 form does), which ensures that a live
+# %vcc is not inadvertently clobbered.
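+#
+# Schematically, each merged pair should then look like (a sketch; the
+# concrete vreg numbers for this input appear in the CHECK lines below):
+#   sX = S_MOV_B32 <base-off>         ; offset materialized into an SGPR
+#   vY = V_ADD_{I|U}32_e64 sX, vBase  ; any carry goes to a vreg, not %vcc
+#   DS_WRITE2_B32 killed vY, ...      ; (_gfx9 variant on GFX9)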
+
+# GCN-LABEL: name: kernel
+
+# VI: V_ADD_I32_e64 %6, %0,
+# VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8,
+# VI: V_ADD_I32_e64 %10, %3,
+# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
+
+# GFX9: V_ADD_U32_e64 %6, %0,
+# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8,
+# GFX9: V_ADD_U32_e64 %9, %3,
+# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
+
+--- |
+  @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4
+
+  define amdgpu_kernel void @kernel() {
+  bb.0:
+    br label %bb2
+
+  bb1:
+    ret void
+
+  bb2:
+    %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
+    %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
+    %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
+    %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+    br label %bb1
+  }
+---
+name: kernel
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0, 0, implicit %exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit %exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def %vcc, implicit %exec
+    DS_WRITE_B32 %0, %0, 1024, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp)
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit %exec
+    DS_WRITE_B32 %0, %3, 1056, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3, 1088, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3, 1120, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp3)
+    %vcc = S_AND_B64 %exec, %vcc, implicit-def %scc
+    S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+    S_BRANCH %bb.1
+...