Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -514,6 +514,7 @@ DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -525,15 +526,16 @@ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -601,6 +603,7 @@ DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -612,17 +615,18 @@ TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + 
.addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); Index: llvm/trunk/test/CodeGen/AMDGPU/merge-load-store-vreg.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/merge-load-store-vreg.mir +++ llvm/trunk/test/CodeGen/AMDGPU/merge-load-store-vreg.mir @@ -6,7 +6,7 @@ # $vcc, which is used in _e32); this ensures that $vcc is not inadvertently # clobbered. -# GCN-LABEL: name: kernel +# GCN-LABEL: name: ds_combine_base_offset{{$}} # VI: V_ADD_I32_e64 %6, %0, # VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8, @@ -21,7 +21,37 @@ --- | @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4 - define amdgpu_kernel void @kernel() { + define amdgpu_kernel void @ds_combine_base_offset() { + bb.0: + br label %bb2 + + bb1: + ret void + + bb2: + %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24 + br label %bb1 + } + + define amdgpu_kernel void @ds_combine_base_offset_subreg() { + bb.0: + br label %bb2 + + bb1: + ret void + + bb2: + %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0 + %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8 + %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16 + %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24 + br label %bb1 + } + + define amdgpu_kernel void @ds_combine_subreg() { bb.0: br label %bb2 @@ -36,7 +66,7 @@ br label %bb1 } --- -name: kernel +name: ds_combine_base_offset body: | bb.0: %0:vgpr_32 = IMPLICIT_DEF @@ -58,3 +88,69 @@ S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.1 ... 
+
+# GCN-LABEL: name: ds_combine_base_offset_subreg{{$}}
+# Stores at 1024/1056 and loads at 1088/1120 use a .sub0 address: the add forming the new base must read that subregister, and the merged DS op uses the new register without one.
+# VI: V_ADD_I32_e64 %6, %0.sub0,
+# VI-NEXT: DS_WRITE2_B32 killed %7, %0.sub0, %3.sub0, 0, 8,
+# VI: V_ADD_I32_e64 %10, %3.sub0,
+# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
+# Second check prefix expects V_ADD_U32_e64 and the _gfx9 DS opcodes, with the read-side registers numbered one lower.
+# GFX9: V_ADD_U32_e64 %6, %0.sub0,
+# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0.sub0, %3.sub0, 0, 8,
+# GFX9: V_ADD_U32_e64 %9, %3.sub0,
+# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
+---
+name: ds_combine_base_offset_subreg
+body: |
+  bb.0:
+    %0:vreg_64 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0.sub0, 0, implicit $exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit $exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def $vcc, implicit $exec
+    DS_WRITE_B32 %0.sub0, %0.sub0, 1024, 0, implicit $m0, implicit $exec :: (store 4 into %ir.tmp)
+    undef %3.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    DS_WRITE_B32 %0.sub0, %3.sub0, 1056, 0, implicit $m0, implicit $exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3.sub0, 1088, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3.sub0, 1120, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp3)
+    $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.1
+...
+
+# GCN-LABEL: name: ds_combine_subreg{{$}}
+# Stores and loads at offsets 0 and 32: the expected write2/read2 use the original .sub0 address operand directly, with no add in the expected output.
+# VI: DS_WRITE2_B32 %0.sub0, %0.sub0, %3.sub0, 0, 8,
+# VI: DS_READ2_B32 %3.sub0, 0, 8,
+# Second check prefix expects the same operands with the _gfx9 opcode variants.
+# GFX9: DS_WRITE2_B32_gfx9 %0.sub0, %0.sub0, %3.sub0, 0, 8,
+# GFX9: DS_READ2_B32_gfx9 %3.sub0, 0, 8,
+---
+name: ds_combine_subreg
+body: |
+  bb.0:
+    %0:vreg_64 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0.sub0, 0, implicit $exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit $exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def $vcc, implicit $exec
+    DS_WRITE_B32 %0.sub0, %0.sub0, 0, 0, implicit $m0, implicit $exec :: (store 4 into %ir.tmp)
+    undef %3.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    DS_WRITE_B32 %0.sub0, %3.sub0, 32, 0, implicit $m0, implicit $exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3.sub0, 0, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3.sub0, 32, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp3)
+    $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.1
+...