Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,7 @@ DS_READ_WRITE, S_BUFFER_LOAD_IMM, BUFFER_LOAD_OFFEN, + BUFFER_LOAD_OFFSET, }; struct CombineInfo { @@ -112,7 +113,7 @@ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); - MachineBasicBlock::iterator mergeBufferLoadOffenPair(CombineInfo &CI); + MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); public: static char ID; @@ -232,7 +233,8 @@ // SMEM offsets must be consecutive. if (CI.InstClass == S_BUFFER_LOAD_IMM || - CI.InstClass == BUFFER_LOAD_OFFEN) { + CI.InstClass == BUFFER_LOAD_OFFEN || + CI.InstClass == BUFFER_LOAD_OFFSET) { unsigned Diff = CI.IsX2 ? 2 : 1; return (EltOffset0 + Diff == EltOffset1 || EltOffset1 + Diff == EltOffset0) && @@ -299,6 +301,10 @@ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; break; + case BUFFER_LOAD_OFFSET: + AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; + break; default: llvm_unreachable("invalid InstClass"); } @@ -399,7 +405,7 @@ } else { CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm(); CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm(); - if (CI.InstClass == BUFFER_LOAD_OFFEN) { + if (CI.InstClass != S_BUFFER_LOAD_IMM) { CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); } @@ -615,21 +621,31 @@ return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadOffenPair( +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + unsigned Opcode; + + if (CI.InstClass == BUFFER_LOAD_OFFEN) { + Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : + AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + } else { + Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : + AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + } const TargetRegisterClass *SuperRC = CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + + if (CI.InstClass == BUFFER_LOAD_OFFEN) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc @@ -724,13 +740,21 @@ continue; } if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) { - CI.InstClass = BUFFER_LOAD_OFFEN; + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { + if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) + CI.InstClass = BUFFER_LOAD_OFFEN; + else + CI.InstClass = BUFFER_LOAD_OFFSET; + CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadOffenPair(CI); + I = mergeBufferLoadPair(CI); if (!CI.IsX2) CreatedX2++; } else { Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -193,6 +193,40 @@ ret void } +;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: +;CHECK-NEXT: BB# +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) + %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: +;CHECK-NEXT: BB# +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 Index: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll +++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll @@ -236,8 +236,7 @@ } ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v +; GCN: buffer_load_dwordx2 v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { @@ -378,10 +377,7 @@ ; should catch this? ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v +; GCN: buffer_load_dwordx4 v ; GCN: s_barrier ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v