Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -137,7 +137,7 @@
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -150,10 +150,10 @@
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
-                      "SI Load / Store Optimizer", false, false)
+                      "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load / Store Optimizer", false, false)
+                    "SI Load Store Optimizer", false, false)
 
 char SILoadStoreOptimizer::ID = 0;
 
@@ -496,13 +496,19 @@
   unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
     unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e64;
     BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
+      .addReg(CarryReg, RegState::Define)
+      .addReg(ImmReg)
       .addReg(AddrReg->getReg());
   }
 
@@ -556,7 +562,7 @@
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
   const MachineOperand *Data1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
 
@@ -579,17 +585,24 @@
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();
 
-  unsigned BaseReg = Addr->getReg();
+  unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(CI.BaseOff);
+
     unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
+      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e64;
     BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
-      .addReg(Addr->getReg());
+      .addReg(CarryReg, RegState::Define)
+      .addReg(ImmReg)
+      .addReg(AddrReg->getReg());
   }
 
   MachineInstrBuilder Write2 =
Index: test/CodeGen/AMDGPU/merge-load-store-vreg.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/merge-load-store-vreg.mir
@@ -0,0 +1,53 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# If there's a base offset, check that SILoadStoreOptimizer creates
+# V_ADD_{I|U}32_e64 for that offset; _e64 uses a vreg for the carry (rather than
+# %vcc, which is used in _e32); this ensures that %vcc is not inadvertently
+# clobbered.
+
+# CHECK-LABEL: name: kernel
+# CHECK: V_ADD_{{[IU]}}32_e64 %7, %0,
+# CHECK-NEXT: DS_WRITE2_B32 killed %6, %0, %3, 0, 8,
+# CHECK: V_ADD_{{[IU]}}32_e64 %10, %3,
+# CHECK-NEXT: DS_READ2_B32 killed %12, 0, 8,
+
+--- |
+  @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4
+
+  define amdgpu_kernel void @kernel() {
+  bb.0:
+    br label %bb2
+
+  bb1:
+    ret void
+
+  bb2:
+    %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
+    %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
+    %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
+    %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+    br label %bb1
+  }
+---
+name: kernel
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0, 0, implicit %exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit %exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def %vcc, implicit %exec
+    DS_WRITE_B32 %0, %0, 1024, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp)
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit %exec
+    DS_WRITE_B32 %0, %3, 1056, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3, 1088, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3, 1120, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp3)
+    %vcc = S_AND_B64 %exec, %vcc, implicit-def %scc
+    S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+    S_BRANCH %bb.1
+...
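Note (hand-written illustration, not part of the patch and not generated llc output): for the first DS_WRITE_B32 pair in the test, the rewritten code should now emit a sequence of roughly the following shape. The vreg numbers assume the numbering implied by the CHECK lines (%6 = BaseReg, %7 = ImmReg, %8 = CarryReg). The base offset 1024 is first materialized with V_MOV_B32_e32, since the _e64 encoding cannot take a literal, and the add's carry-out lands in the fresh sreg_64 %8 rather than in %vcc:

  %7:vgpr_32 = V_MOV_B32_e32 1024, implicit %exec
  %6:vgpr_32, %8:sreg_64 = V_ADD_I32_e64 %7, %0, implicit %exec
  DS_WRITE2_B32 killed %6, %0, %3, 0, 8, 0, implicit %m0, implicit %exec

With the old V_ADD_I32_e32 form, the inserted add would have clobbered the %vcc defined by V_CMP_NE_U32_e32 before S_CBRANCH_VCCNZ reads it; the merged DS_READ2_B32 pair is rebased the same way.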