Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -761,6 +761,15 @@ CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; bool isBasicBlockPrologue(const MachineInstr &MI) const override; + + /// \brief Return a partially built integer add instruction without carry. + /// Caller must add source operands. + /// For pre-GFX9 it will generate unused carry destination operand. + /// TODO: After GFX9 it should return a no-carry operation. + MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, + unsigned DestReg) const; }; namespace AMDGPU { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3944,3 +3944,16 @@ return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); } + +MachineInstrBuilder +SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, + unsigned DestReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead); +} Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -385,17 +385,18 @@ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(AddrReg->getReg()); + .addImm(CI.BaseOff) + .addReg(AddrReg->getReg()); } - MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*CI.I->memoperands_begin()) - .addMemOperand(*CI.Paired->memoperands_begin()); + MachineInstrBuilder Read2 = + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + (void)Read2; const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -457,19 +458,19 @@ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(Addr->getReg()); + .addImm(CI.BaseOff) + .addReg(Addr->getReg()); } - MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*CI.I->memoperands_begin()) - .addMemOperand(*CI.Paired->memoperands_begin()); + MachineInstrBuilder Write2 = + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); moveInstsAfter(Write2, CI.InstsToMove); Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -278,7 +278,6 @@ } MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -288,8 +287,7 @@ BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg) .addFrameIndex(FrameIdx); - BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) - .addReg(UnusedCarry, RegState::Define | RegState::Dead) + TII->getAddNoCarry(*MBB, Ins, DL, BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); } Index: test/CodeGen/AMDGPU/ds-combine-large-stride.ll =================================================================== --- test/CodeGen/AMDGPU/ds-combine-large-stride.ll +++ test/CodeGen/AMDGPU/ds-combine-large-stride.ll @@ -1,16 +1,20 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; CHECK-LABEL: ds_read32_combine_stride_400: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 -define void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +; GCN-LABEL: ds_read32_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = load float, float addrspace(3)* %arg, align 4 %tmp2 = fadd float %tmp, 0.000000e+00 @@ -39,17 +43,20 @@ ret void } -; CHECK-LABEL: ds_read32_combine_stride_400_back: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 -; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 -define void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +; GCN-LABEL: ds_read32_combine_stride_400_back: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100 +; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100 +define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 %tmp2 = load float, float addrspace(3)* %tmp, align 4 @@ -78,14 +85,14 @@ ret void } -; CHECK-LABEL: ds_read32_combine_stride_8192: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160 -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224 -define void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +; GCN-LABEL: ds_read32_combine_stride_8192: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224 +define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = load float, float addrspace(3)* %arg, align 4 %tmp2 = fadd float %tmp, 0.000000e+00 @@ -114,16 +121,19 @@ ret void } -; CHECK-LABEL: ds_read32_combine_stride_8192_shifted: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32 -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32 -; CHECK-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32 -define void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { +; GCN-LABEL: ds_read32_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32 +; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32 +define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2 %tmp2 = load float, float addrspace(3)* %tmp, align 4 @@ -147,15 +157,16 @@ ret void } -; CHECK-LABEL: ds_read64_combine_stride_400: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 -; CHECK-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150 -; CHECK-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250 -; CHECK-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50 -define void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { +; GCN-LABEL: ds_read64_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250 +; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50 +define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { bb: %tmp = load double, double addrspace(3)* %arg, align 8 %tmp2 = fadd double %tmp, 0.000000e+00 @@ -184,16 +195,19 @@ ret void } -; CHECK-LABEL: ds_read64_combine_stride_8192_shifted: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; CHECK-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16 -; CHECK-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16 -; CHECK-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16 -define void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { +; GCN-LABEL: ds_read64_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16 +; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16 +define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) { bb: %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1 %tmp2 = load double, double addrspace(3)* %tmp, align 8 @@ -217,17 +231,20 @@ ret void } -; CHECK-LABEL: ds_write32_combine_stride_400: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -define void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write32_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) { bb: store float 1.000000e+00, float addrspace(3)* %arg, align 4 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100 @@ -247,17 +264,20 @@ ret void } -; CHECK-LABEL: ds_write32_combine_stride_400_back: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -; CHECK-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 -define void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write32_combine_stride_400_back: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100 +define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700 store float 1.000000e+00, float addrspace(3)* %tmp, align 4 @@ -277,14 +297,14 @@ ret void } -; CHECK-LABEL: ds_write32_combine_stride_8192: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -; CHECK-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 -; CHECK-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160 -; CHECK-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224 -define void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write32_combine_stride_8192: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160 +; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224 +define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) { bb: store float 1.000000e+00, float addrspace(3)* %arg, align 4 %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048 @@ -304,16 +324,19 @@ ret void } -; CHECK-LABEL: ds_write32_combine_stride_8192_shifted: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] -; CHECK-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -; CHECK-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -; CHECK-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 -define void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write32_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]] +; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32 +define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 store float 1.000000e+00, float addrspace(3)* %tmp, align 4 @@ -330,15 +353,16 @@ ret void } -; CHECK-LABEL: ds_write64_combine_stride_400: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] -; CHECK-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 -; CHECK-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150 -; CHECK-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250 -; CHECK-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 -define void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write64_combine_stride_400: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]] +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150 +; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250 +; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50 +define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) { bb: store double 1.000000e+00, double addrspace(3)* %arg, align 8 %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50 @@ -358,16 +382,19 @@ ret void } -; CHECK-LABEL: ds_write64_combine_stride_8192_shifted: -; CHECK: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 -; CHECK: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] -; CHECK-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] -; CHECK-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] -; CHECK-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 -; CHECK-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 -; CHECK-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 -define void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) { +; GCN-LABEL: ds_write64_combine_stride_8192_shifted: +; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0 +; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]] +; GCN-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GCN-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]] +; GFX9-DAG: v_add_i32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]] +; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16 +define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) { bb: %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1 store double 1.000000e+00, double addrspace(3)* %tmp, align 8