diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -130,6 +130,7 @@ int AddrIdx[5]; const MachineOperand *AddrReg[5]; unsigned NumAddresses; + unsigned Order; bool hasSameBaseAddress(const MachineInstr &MI) { for (unsigned i = 0; i < NumAddresses; i++) { @@ -210,7 +211,7 @@ const SIInstrInfo &TII, const CombineInfo &Paired); static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI, - CombineInfo &Paired); + CombineInfo &Paired, bool Modify = false); static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI, const CombineInfo &Paired); static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); @@ -259,8 +260,6 @@ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - void removeCombinedInst(std::list &MergeList, - const MachineInstr &MI); bool optimizeInstsWithSameBaseAddr(std::list &MergeList, bool &OptimizeListAgain); bool optimizeBlock(std::list > &MergeableInsts); @@ -720,7 +719,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI, - CombineInfo &Paired) { + CombineInfo &Paired, + bool Modify) { assert(CI.InstClass != MIMG); // XXX - Would the same offset be OK? Is there any reason this would happen or @@ -769,20 +769,25 @@ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC); } + // Handle DS instructions. // If the offset in elements doesn't fit in 8-bits, we might be able to use // the stride 64 versions. 
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { - CI.Offset = EltOffset0 / 64; - Paired.Offset = EltOffset1 / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset = EltOffset0 / 64; + Paired.Offset = EltOffset1 / 64; + CI.UseST64 = true; + } return true; } // Check if the new offsets fit in the reduced 8-bit range. if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { - CI.Offset = EltOffset0; - Paired.Offset = EltOffset1; + if (Modify) { + CI.Offset = EltOffset0; + Paired.Offset = EltOffset1; + } return true; } @@ -791,15 +796,19 @@ CI.BaseOff = std::min(CI.Offset, Paired.Offset); if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) { - CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; - Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; - CI.UseST64 = true; + if (Modify) { + CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64; + Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64; + CI.UseST64 = true; + } return true; } if (isUInt<8>(OffsetDiff)) { - CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; - Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + if (Modify) { + CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize; + Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize; + } return true; } @@ -824,11 +833,19 @@ } } + +/// This function assumes that CI comes before Paired in a basic block. bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI, CombineInfo &Paired) { - MachineBasicBlock *MBB = CI.I->getParent(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator MBBI = CI.I; + + // Check both offsets (or masks for MIMG) can be combined and fit in the + // reduced range. 
+ if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) + return false; + + if (CI.InstClass != MIMG && + (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STI, Paired))) + return false; const unsigned Opc = CI.I->getOpcode(); const InstClassEnum InstClass = getInstClass(Opc, *TII); @@ -844,12 +861,12 @@ if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) return false; - ++MBBI; - DenseSet RegDefsToMove; DenseSet PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); + MachineBasicBlock::iterator E = std::next(Paired.I); + MachineBasicBlock::iterator MBBI = std::next(CI.I); for (; MBBI != E; ++MBBI) { if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || @@ -904,23 +921,22 @@ CI.InstsToMove)) continue; - bool Match = CI.hasSameBaseAddress(*MBBI); - - if (Match) { - Paired.setMI(MBBI, *TII, *STM); - - // Check both offsets (or masks for MIMG) can be combined and fit in the - // reduced range. - bool canBeCombined = - CI.InstClass == MIMG - ? dmasksCanBeCombined(CI, *TII, Paired) - : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired); - - // We also need to go through the list of instructions that we plan to + if (&*MBBI == &*Paired.I) { + // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) + if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) { + + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. + if (CI.InstClass != MIMG) + offsetsCanBeCombined(CI, *STI, Paired, true); return true; + } + return false; } // We've found a load/store that we couldn't merge for some reason. 
@@ -1905,6 +1921,7 @@ SmallPtrSet AnchorList; // Sort potential mergeable instructions into lists. One list per base address. + unsigned Order = 0; for (MachineInstr &MI : MBB.instrs()) { // We run this before checking if an address is mergeable, because it can produce // better code even if the instructions aren't mergeable. @@ -1921,12 +1938,43 @@ CombineInfo CI; CI.setMI(MI, *TII, *STM); + CI.Order = Order++; if (!CI.hasMergeableAddress(*MRI)) continue; addInstToMergeableList(CI, MergeableInsts); } + + // At this point we have lists of Mergeable instructions. + // + // Part 2: Discard lists that hold fewer than two instructions and sort the + // remaining lists by offset. The search for an instruction that can be merged + // with each CombineInfo object is done afterwards, in + // optimizeInstsWithSameBaseAddr(). + + for (std::list>::iterator I = MergeableInsts.begin(), + E = MergeableInsts.end(); I != E;) { + + std::list &MergeList = *I; + if (MergeList.size() <= 1) { + // This means we have found only one instruction with a given address + // that can be merged, and we need at least 2 instructions to do a merge, + // so this list can be discarded. + I = MergeableInsts.erase(I); + continue; + } + + // Sort the lists by offsets, this way mergeable instructions will be + // adjacent to each other in the list, which will make it easier to find + // matches. 
+ MergeList.sort( + [] (const CombineInfo &A, CombineInfo &B) { + return A.Offset < B.Offset; + }); + ++I; + } + return Modified; } @@ -1937,40 +1985,34 @@ std::list > &MergeableInsts) { bool Modified = false; - for (std::list &MergeList : MergeableInsts) { - if (MergeList.size() < 2) + for (std::list>::iterator I = MergeableInsts.begin(), + E = MergeableInsts.end(); I != E;) { + std::list &MergeList = *I; + if (MergeList.empty()) { + I = MergeableInsts.erase(I); continue; + } bool OptimizeListAgain = false; if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { - // We weren't able to make any changes, so clear the list so we don't + // We weren't able to make any changes, so delete the list so we don't // process the same instructions the next time we try to optimize this // block. - MergeList.clear(); + I = MergeableInsts.erase(I); continue; } - // We made changes, but also determined that there were no more optimization - // opportunities, so we don't need to reprocess the list - if (!OptimizeListAgain) - MergeList.clear(); - - OptimizeAgain |= OptimizeListAgain; Modified = true; - } - return Modified; -} - -void -SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, - const MachineInstr &MI) { - for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { - if (&*CI->I == &MI) { - MergeList.erase(CI); - return; + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) { + I = MergeableInsts.erase(I); + continue; } + OptimizeAgain = true; } + return Modified; } bool @@ -1978,18 +2020,24 @@ std::list &MergeList, bool &OptimizeListAgain) { bool Modified = false; - for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { - CombineInfo &CI = *I; - CombineInfo Paired; + for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); + Next = std::next(I)) { - if (CI.InstClass == UNKNOWN) - continue; + auto 
First = I; + auto Second = Next; + + if ((*First).Order > (*Second).Order) + std::swap(First, Second); + CombineInfo &CI = *First; + CombineInfo &Paired = *Second; - if (!findMatchingInst(CI, Paired)) - goto done; + if (!findMatchingInst(CI, Paired)) { + ++I; + CI.InstsToMove.clear(); + continue; + } Modified = true; - removeCombinedInst(MergeList, *Paired.I); switch (CI.InstClass) { default: @@ -2042,12 +2090,11 @@ break; } } + CI.Order = Paired.Order; + if (I == Second) + I = Next; -done: - // Clear the InstsToMove after we have finished searching so we don't have - // stale values left over if we search for this CI again in another pass - // over the block. - CI.InstsToMove.clear(); + MergeList.erase(Second); } return Modified; diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -6,9 +6,9 @@ ; offset0 is larger than offset1 ; SI-LABEL: {{^}}offset_order: -; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}} +; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}} ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir --- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir @@ -66,6 +66,7 @@ attributes #0 = { convergent nounwind } define amdgpu_kernel void @merge_mmos(i32 addrspace(1)* %ptr_addr1) { ret void } + define amdgpu_kernel void @reorder_offsets(i32 addrspace(1)* %reorder_addr1) { ret 
void } ... --- @@ -194,3 +195,26 @@ S_ENDPGM 0 ... +--- +# CHECK-LABEL: reorder_offsets +# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1) +# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1) + +name: reorder_offsets +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1) + S_ENDPGM 0 + + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir --- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -133,17 +133,17 @@ # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: gfx9_tbuffer_load_float_32 body: | @@ -170,17 +170,17 @@ # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX9: %{{[0-9]+}}:vreg_64 = 
COPY %16.sub0_sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: gfx9_tbuffer_load_sint_32 body: | @@ -207,17 +207,17 @@ # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: 
gfx9_tbuffer_load_uint_32 body: | @@ -492,11 +492,11 @@ # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_float32 body: | @@ -546,11 +546,11 @@ # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, 
%subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_sint32 body: | @@ -600,11 +600,11 @@ # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_uint32 body: | @@ -911,17 +911,17 @@ # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: gfx10_tbuffer_load_float_32 body: | @@ -948,17 +948,17 @@ # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 
76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: gfx10_tbuffer_load_sint_32 body: | @@ -985,17 +985,17 @@ # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 # GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vreg_64 = COPY %18.sub0_sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %18.sub2_sub3 +# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 +# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %16.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %17.sub0 -# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub1 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 +# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1 name: gfx10_tbuffer_load_uint_32 body: | @@ -1271,11 +1271,11 @@ # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_float32 body: | @@ -1325,11 +1325,11 @@ # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # 
GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_sint32 body: | @@ -1379,11 +1379,11 @@ # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 # GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, killed %16, %subreg.sub2_sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %18, %13, 0, 16, 75, 0, 0, 0, 
0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %17, %subreg.sub0_sub1, %0, %subreg.sub2 +# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 # GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_uint32 body: |