Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -80,6 +80,8 @@ S_BUFFER_LOAD_IMM, BUFFER_LOAD_OFFEN, BUFFER_LOAD_OFFSET, + BUFFER_STORE_OFFEN, + BUFFER_STORE_OFFSET, }; struct CombineInfo { @@ -114,6 +116,9 @@ MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); + unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, + bool &IsOffen) const; + MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); public: static char ID; @@ -231,10 +236,8 @@ CI.UseST64 = false; CI.BaseOff = 0; - // SMEM offsets must be consecutive. - if (CI.InstClass == S_BUFFER_LOAD_IMM || - CI.InstClass == BUFFER_LOAD_OFFEN || - CI.InstClass == BUFFER_LOAD_OFFSET) { + // Handle SMEM and VMEM instructions. + if (CI.InstClass != DS_READ_WRITE) { unsigned Diff = CI.IsX2 ? 2 : 1; return (EltOffset0 + Diff == EltOffset1 || EltOffset1 + Diff == EltOffset0) && @@ -297,11 +300,13 @@ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; break; case BUFFER_LOAD_OFFEN: + case BUFFER_STORE_OFFEN: AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; break; case BUFFER_LOAD_OFFSET: + case BUFFER_STORE_OFFSET: AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; break; @@ -680,6 +685,90 @@ return Next; } +unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( + const MachineInstr &I, bool &IsX2, bool &IsOffen) const { + IsX2 = false; + IsOffen = false; + + switch (I.getOpcode()) { + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + IsX2 = true; + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: + IsX2 = true; + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + IsX2 = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: + IsX2 = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; + } + return 0; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + bool Unused1, Unused2; + unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); + + unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + + // Handle descending offsets + if (CI.Offset0 > CI.Offset1) + std::swap(SubRegIdx0, SubRegIdx1); + + // Copy to the new source register. + const TargetRegisterClass *SuperRC = + CI.IsX2 ? 
&AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) + .addReg(SrcReg, RegState::Kill); + + if (CI.InstClass == BUFFER_STORE_OFFEN) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + + moveInstsAfter(MIB, CI.InstsToMove); + + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return Next; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. @@ -763,6 +852,22 @@ continue; } + bool StoreIsX2, IsOffen; + if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { + CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + CI.EltSize = 4; + CI.IsX2 = StoreIsX2; + if (findMatchingInst(CI)) { + Modified = true; + I = mergeBufferStorePair(CI); + if (!CI.IsX2) + CreatedX2++; + } else { + ++I; + } + continue; + } + ++I; } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -95,6 +95,81 @@ ret void } +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 
%a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -237,8 +237,7 @@ ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: ; GCN: buffer_load_dwordx2 v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx2 v define amdgpu_kernel void 
@merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -379,10 +378,7 @@ ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: ; GCN: buffer_load_dwordx4 v ; GCN: s_barrier -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 v define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
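For context, a minimal standalone reproducer (not part of the patch above) is sketched below. It assumes the same llvm.amdgcn.buffer.store.f32 signature declared in llvm.amdgcn.buffer.store.ll; the function name and RUN/CHECK lines are illustrative only. Two adjacent x1 stores at constant offsets 4 and 8 should now be combined by SILoadStoreOptimizer into a single buffer_store_dwordx2, mirroring the buffer_store_x1_offset_merged test added above.

;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store_x1_pair_sketch:
;CHECK: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
define amdgpu_ps void @buffer_store_x1_pair_sketch(<4 x i32> inreg %rsrc, float %v1, float %v2) {
  ; Two dword stores to adjacent offsets in the same buffer resource; the
  ; load/store optimizer is expected to merge them into one dwordx2 store.
  call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
  ret void
}

declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)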