diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -303,6 +303,8 @@ return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: return 4; + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + return 8; case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; @@ -372,6 +374,7 @@ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return S_BUFFER_LOAD_IMM; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: @@ -413,6 +416,7 @@ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; } } @@ -463,6 +467,7 @@ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: Result.SBase = true; return Result; case AMDGPU::DS_READ_B32: @@ -857,6 +862,7 @@ return false; case 2: case 4: + case 8: return true; } } @@ -1523,45 +1529,62 @@ return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + case 8: + return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } case MIMG: - assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); + assert((countPopulation(CI.DMask | Paired.DMask) == Width) && + "No overlaps"); return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); } } std::pair -SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, + const CombineInfo &Paired) { - if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) - return std::make_pair(0, 0); + assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero"); bool ReverseOrder; if (CI.InstClass == MIMG) { - assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && - "No overlaps"); + assert( + (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && + "No overlaps"); ReverseOrder = CI.DMask > Paired.DMask; } else ReverseOrder = CI.Offset > Paired.Offset; - static const unsigned Idxs[4][4] = { - {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, - {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, - {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, - {AMDGPU::sub3, 0, 0, 0}, - }; unsigned Idx0; unsigned Idx1; - assert(CI.Width >= 1 && CI.Width <= 3); - assert(Paired.Width >= 1 && Paired.Width <= 3); + if (CI.Width + Paired.Width > 4) { + assert(CI.Width == 4 && Paired.Width == 4); - if (ReverseOrder) { - Idx1 = Idxs[0][Paired.Width - 1]; - Idx0 = Idxs[Paired.Width][CI.Width - 1]; + if (ReverseOrder) { + Idx1 = AMDGPU::sub0_sub1_sub2_sub3; + Idx0 = AMDGPU::sub4_sub5_sub6_sub7; + } else { + Idx0 = AMDGPU::sub0_sub1_sub2_sub3; + Idx1 = AMDGPU::sub4_sub5_sub6_sub7; + } } else { - Idx0 = Idxs[0][CI.Width - 1]; - Idx1 = Idxs[CI.Width][Paired.Width - 1]; + static const unsigned Idxs[4][4] = { + {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, + {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, + {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, + {AMDGPU::sub3, 0, 0, 0}, + }; + + assert(CI.Width >= 1 && CI.Width <= 3); + assert(Paired.Width >= 1 && Paired.Width <= 3); + + if (ReverseOrder) { + Idx1 = Idxs[0][Paired.Width - 1]; + Idx0 = Idxs[Paired.Width][CI.Width - 1]; + } else { + Idx0 = Idxs[0][CI.Width - 1]; + Idx1 = Idxs[CI.Width][Paired.Width - 1]; + } } return std::make_pair(Idx0, Idx1); @@ -2134,7 +2157,7 @@ MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove); CI.setMI(NewMI, *TII, *STM); - OptimizeListAgain |= (CI.Width + Paired.Width) < 16; + OptimizeListAgain |= (CI.Width + Paired.Width) < 8; break; } case BUFFER_LOAD: { diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir @@ -0,0 +1,133 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s + +# CHECK-LABEL: name: merge_s_buffer_load_x2 +# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4) + +name: merge_s_buffer_load_x2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x4 +# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4) +name: merge_s_buffer_load_x4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) + %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) + %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x8 +# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4) +name: merge_s_buffer_load_x8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) + %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) + %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) + %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32)) + %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32)) + %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x8_reordered +# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4) +name: merge_s_buffer_load_x8_reordered +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) + %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32)) + %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) + %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) + %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2 +# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 8) +name: merge_s_buffer_load_x8_out_of_x2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64)) + %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64)) + %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) + %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... +--- + +# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4 +# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16) +name: merge_s_buffer_load_x8_out_of_x4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) + + S_ENDPGM 0 +... +--- + + +# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed +# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16) +name: merge_s_buffer_load_x8_mixed +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64)) + %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +---