Index: llvm/lib/CodeGen/InlineSpiller.cpp
===================================================================
--- llvm/lib/CodeGen/InlineSpiller.cpp
+++ llvm/lib/CodeGen/InlineSpiller.cpp
@@ -495,6 +495,44 @@
   return true;
 }
 
+/// Decide whether the subranges of \p SLI are consistent with those of \p LI
+/// at \p Idx.
+///
+/// Every pair of subranges (one from each interval) is examined:
+///  - identical lane masks must yield the same VNInfo pointer at \p Idx;
+///  - distinct but overlapping lane masks are conservatively rejected;
+///  - disjoint lane masks are ignored.
+///
+/// \p MI, \p MRI and \p TRI are not consulted by the current implementation.
+///
+/// \returns false as soon as a conflicting pair is found, true otherwise
+/// (including when either interval has no subranges).
+static bool allSubRangeValNoSame(const LiveInterval &LI,
+                                 const LiveInterval &SLI,
+                                 const MachineInstr &MI,
+                                 const MachineRegisterInfo &MRI,
+                                 const TargetRegisterInfo &TRI, SlotIndex Idx) {
+  for (const LiveInterval::SubRange &SpilledSR : SLI.subranges()) {
+    const VNInfo *SpilledVNI = SpilledSR.getVNInfoAt(Idx);
+
+    for (const LiveInterval::SubRange &CandSR : LI.subranges()) {
+      // Exactly matching lanes: the value numbers have to agree.
+      if (CandSR.LaneMask == SpilledSR.LaneMask) {
+        if (CandSR.getVNInfoAt(Idx) != SpilledVNI)
+          return false;
+        continue;
+      }
+
+      // TODO: Check non-exact, overlapping subranges if they share the same
+      // def instruction
+      if ((CandSR.LaneMask & SpilledSR.LaneMask).any())
+        return false;
+    }
+  }
+
+  return true;
+}
+
 /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
 /// redundant spills of this value in SLI.reg and sibling copies.
 void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
@@ -524,7 +562,13 @@
       if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
-      if (LI->getVNInfoAt(Idx) != VNI)
+
+      // The main range value numbers will differ if multiple instructions are
+      // used to define its various subregisters. Check the subregister value
+      // numbers as a fallback.
+      if (LI->getVNInfoAt(Idx) != VNI &&
+          (!SLI.hasSubRanges() ||
+           !allSubRangeValNoSame(*LI, SLI, MI, MRI, TRI, Idx)))
         continue;
 
       // Follow sibling copies down the dominator tree.
Index: llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir +++ llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir @@ -47,7 +47,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F + ; GCN-NEXT: liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr57 = COPY $vgpr9, implicit $exec ; GCN-NEXT: renamable $vgpr56 = COPY $vgpr8, implicit $exec @@ -62,17 +62,15 @@ ; GCN-NEXT: renamable $sgpr16_sgpr17 = IMPLICIT_DEF ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31 - ; 
GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.2 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.1 + 4, addrspace 5) ; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0 - ; GCN-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5) + ; GCN-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, 
implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2 + 4, addrspace 5) ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) - ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2, addrspace 5) - ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2 + 4, addrspace 5) + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5) ; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: Index: llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10273,197 +10273,183 @@ ; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s8, 0 -; GFX6-NEXT: 
v_writelane_b32 v4, s9, 1 -; GFX6-NEXT: v_writelane_b32 v4, s10, 2 -; GFX6-NEXT: v_writelane_b32 v4, s11, 3 -; GFX6-NEXT: v_writelane_b32 v4, s12, 4 -; GFX6-NEXT: v_writelane_b32 v4, s13, 5 -; GFX6-NEXT: v_writelane_b32 v4, s14, 6 -; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v0, s8, 0 +; GFX6-NEXT: v_writelane_b32 v0, s9, 1 +; GFX6-NEXT: v_writelane_b32 v0, s10, 2 +; GFX6-NEXT: v_writelane_b32 v0, s11, 3 +; GFX6-NEXT: v_writelane_b32 v0, s12, 4 +; GFX6-NEXT: v_writelane_b32 v0, s13, 5 +; GFX6-NEXT: v_writelane_b32 v0, s14, 6 +; GFX6-NEXT: v_writelane_b32 v0, s15, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x84400 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x83c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x83c00 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s8, v4, 0 -; GFX6-NEXT: v_readlane_b32 s9, v4, 1 -; GFX6-NEXT: v_readlane_b32 s10, v4, 2 -; GFX6-NEXT: v_readlane_b32 s11, v4, 3 -; GFX6-NEXT: v_readlane_b32 s12, v4, 4 -; GFX6-NEXT: v_readlane_b32 s13, v4, 5 -; GFX6-NEXT: v_readlane_b32 s14, v4, 6 -; GFX6-NEXT: v_readlane_b32 s15, v4, 7 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s8, v0, 0 
+; GFX6-NEXT: v_readlane_b32 s9, v0, 1 +; GFX6-NEXT: v_readlane_b32 s10, v0, 2 +; GFX6-NEXT: v_readlane_b32 s11, v0, 3 +; GFX6-NEXT: v_readlane_b32 s12, v0, 4 +; GFX6-NEXT: v_readlane_b32 s13, v0, 5 +; GFX6-NEXT: v_readlane_b32 s14, v0, 6 +; GFX6-NEXT: v_readlane_b32 s15, v0, 7 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s16, 0 -; GFX6-NEXT: v_writelane_b32 v4, s17, 1 -; GFX6-NEXT: v_writelane_b32 v4, s18, 2 -; GFX6-NEXT: v_writelane_b32 v4, s19, 3 -; GFX6-NEXT: v_writelane_b32 v4, s20, 4 -; GFX6-NEXT: v_writelane_b32 v4, s21, 5 -; GFX6-NEXT: v_writelane_b32 v4, s22, 6 -; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v0, s16, 0 +; GFX6-NEXT: v_writelane_b32 v0, s17, 1 +; GFX6-NEXT: v_writelane_b32 v0, s18, 2 +; GFX6-NEXT: v_writelane_b32 v0, s19, 3 +; GFX6-NEXT: v_writelane_b32 v0, s20, 4 +; GFX6-NEXT: v_writelane_b32 v0, s21, 5 +; GFX6-NEXT: v_writelane_b32 v0, s22, 6 +; GFX6-NEXT: v_writelane_b32 v0, s23, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84400 -; GFX6-NEXT: 
buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x84400 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s16, v4, 0 -; GFX6-NEXT: v_readlane_b32 s17, v4, 1 -; GFX6-NEXT: v_readlane_b32 s18, v4, 2 -; GFX6-NEXT: v_readlane_b32 s19, v4, 3 -; GFX6-NEXT: v_readlane_b32 s20, v4, 4 -; GFX6-NEXT: v_readlane_b32 s21, v4, 5 -; GFX6-NEXT: v_readlane_b32 s22, v4, 6 -; GFX6-NEXT: v_readlane_b32 s23, v4, 7 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s16, v0, 0 +; GFX6-NEXT: v_readlane_b32 s17, v0, 1 +; GFX6-NEXT: v_readlane_b32 s18, v0, 2 +; GFX6-NEXT: v_readlane_b32 s19, v0, 3 +; GFX6-NEXT: v_readlane_b32 s20, v0, 4 +; GFX6-NEXT: v_readlane_b32 s21, v0, 5 +; GFX6-NEXT: v_readlane_b32 s22, v0, 6 +; GFX6-NEXT: v_readlane_b32 s23, v0, 7 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s24, 0 -; GFX6-NEXT: v_writelane_b32 v4, s25, 1 -; GFX6-NEXT: v_writelane_b32 v4, s26, 2 -; GFX6-NEXT: v_writelane_b32 v4, s27, 3 -; GFX6-NEXT: v_writelane_b32 v4, s28, 4 -; GFX6-NEXT: v_writelane_b32 v4, s29, 5 -; GFX6-NEXT: v_writelane_b32 v4, s30, 6 -; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x85400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v0, s24, 0 +; GFX6-NEXT: v_writelane_b32 v0, s25, 1 +; GFX6-NEXT: 
v_writelane_b32 v0, s26, 2 +; GFX6-NEXT: v_writelane_b32 v0, s27, 3 +; GFX6-NEXT: v_writelane_b32 v0, s28, 4 +; GFX6-NEXT: v_writelane_b32 v0, s29, 5 +; GFX6-NEXT: v_writelane_b32 v0, s30, 6 +; GFX6-NEXT: v_writelane_b32 v0, s31, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x85400 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s24, v4, 0 -; GFX6-NEXT: v_readlane_b32 s25, v4, 1 -; GFX6-NEXT: v_readlane_b32 s26, v4, 2 -; GFX6-NEXT: v_readlane_b32 s27, v4, 3 -; GFX6-NEXT: v_readlane_b32 s28, v4, 4 -; GFX6-NEXT: v_readlane_b32 s29, v4, 5 -; GFX6-NEXT: v_readlane_b32 s30, v4, 6 -; GFX6-NEXT: v_readlane_b32 s31, v4, 7 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s24, v0, 0 +; GFX6-NEXT: v_readlane_b32 s25, v0, 1 +; GFX6-NEXT: v_readlane_b32 s26, v0, 2 +; GFX6-NEXT: v_readlane_b32 s27, v0, 3 +; GFX6-NEXT: v_readlane_b32 s28, v0, 4 +; GFX6-NEXT: v_readlane_b32 s29, v0, 5 +; GFX6-NEXT: v_readlane_b32 s30, v0, 6 +; GFX6-NEXT: v_readlane_b32 s31, v0, 7 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] -; GFX6-NEXT: s_mov_b64 s[36:37], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; 
GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s0, 0 -; GFX6-NEXT: v_writelane_b32 v4, s1, 1 -; GFX6-NEXT: v_writelane_b32 v4, s2, 2 -; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s4, 0 -; GFX6-NEXT: v_writelane_b32 v4, s5, 1 -; GFX6-NEXT: v_writelane_b32 v4, s6, 2 -; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v0, s4, 0 +; GFX6-NEXT: v_writelane_b32 v0, s5, 1 +; GFX6-NEXT: v_writelane_b32 v0, s6, 2 +; GFX6-NEXT: v_writelane_b32 v0, s7, 3 +; GFX6-NEXT: s_mov_b32 s36, 0x85c00 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s2, 0 -; GFX6-NEXT: v_writelane_b32 v4, s3, 1 -; GFX6-NEXT: s_mov_b32 s4, 0x86400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v0, s2, 0 +; GFX6-NEXT: v_writelane_b32 v0, 
s3, 1 +; GFX6-NEXT: s_mov_b32 s4, 0x86000 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s4 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: s_mov_b32 s38, 0x85400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 -; GFX6-NEXT: v_readlane_b32 s4, v4, 4 -; GFX6-NEXT: v_readlane_b32 s5, v4, 5 -; GFX6-NEXT: v_readlane_b32 s6, v4, 6 -; GFX6-NEXT: v_readlane_b32 s7, v4, 7 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s0, v0, 0 +; GFX6-NEXT: v_readlane_b32 s1, v0, 1 +; GFX6-NEXT: v_readlane_b32 s2, v0, 2 +; GFX6-NEXT: v_readlane_b32 s3, v0, 3 +; GFX6-NEXT: v_readlane_b32 s4, v0, 4 +; GFX6-NEXT: v_readlane_b32 s5, v0, 5 +; GFX6-NEXT: v_readlane_b32 s6, v0, 6 +; GFX6-NEXT: v_readlane_b32 s7, v0, 7 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x2170 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte 
Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s36, v4, 0 -; GFX6-NEXT: v_readlane_b32 s37, v4, 1 -; GFX6-NEXT: v_readlane_b32 s38, v4, 2 -; GFX6-NEXT: v_readlane_b32 s39, v4, 3 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s36, v0, 0 +; GFX6-NEXT: v_readlane_b32 s37, v0, 1 +; GFX6-NEXT: v_readlane_b32 s38, v0, 2 +; GFX6-NEXT: v_readlane_b32 s39, v0, 3 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: s_mov_b64 vcc, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x2180 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s34, v4, 0 -; GFX6-NEXT: v_readlane_b32 s35, v4, 1 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s34, v0, 0 +; GFX6-NEXT: v_readlane_b32 s35, v0, 1 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: ;;#ASMSTART @@ -10472,39 +10458,21 @@ ; GFX6-NEXT: s_mov_b64 s[34:35], vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s6, 0x86200 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s6 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; 
GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s0, v0, 0 +; GFX6-NEXT: v_readlane_b32 s1, v0, 1 +; GFX6-NEXT: v_readlane_b32 s2, v0, 2 +; GFX6-NEXT: v_readlane_b32 s3, v0, 3 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84400 -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 -; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b32 s2, 0x84c00 ; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], 
s2 offset:8 ; 4-byte Folded Reload @@ -10667,19 +10635,19 @@ ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:240 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:160 @@ -10688,7 +10656,7 @@ ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:112 @@ -10749,26 +10717,24 @@ ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 +; GFX9-FLATSCR-NEXT: 
v_mov_b32_e32 v0, v16 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_nop 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v19 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v3 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, v0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND @@ -10784,18 +10750,18 @@ ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 +; GFX9-FLATSCR-NEXT: s_movk_i32 
s0, 0x20c0 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s37 ; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s36, v4 ; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:240 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:208 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:192 @@ -10805,7 +10771,7 @@ ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:176 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload @@ -10900,10 +10866,6 @@ ; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 -; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] -; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v63 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v58 @@ -10913,7 +10875,6 
@@ ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v61 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v60 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v35 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v39 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v34 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v33 @@ -10962,6 +10923,9 @@ ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v30 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v29 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART +; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] +; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v33 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v53 @@ -10993,6 +10957,7 @@ ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND +; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 Index: llvm/test/CodeGen/AMDGPU/swdev380865.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -16,90 +16,63 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 ; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1 -; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 -; CHECK-NEXT: s_mov_b32 s5, 0x40280000 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v2, s2, 0 -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s1, 0x40180000 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v2, s2, 0 ; CHECK-NEXT: 
v_writelane_b32 v2, s0, 1 ; CHECK-NEXT: v_writelane_b32 v2, s1, 2 -; CHECK-NEXT: s_mov_b32 s1, 0x40220000 +; CHECK-NEXT: s_mov_b32 s1, 0x40240000 ; CHECK-NEXT: v_writelane_b32 v2, s0, 3 +; CHECK-NEXT: v_mov_b32_e32 v0, s6 ; CHECK-NEXT: v_writelane_b32 v2, s1, 4 -; CHECK-NEXT: s_mov_b32 s1, 0x40240000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 -; CHECK-NEXT: s_mov_b32 s1, 0x40260000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: s_mov_b32 s3, 0x40260000 +; CHECK-NEXT: s_mov_b32 s5, 0x40280000 +; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; CHECK-NEXT: s_mov_b32 s2, 0 -; CHECK-NEXT: s_mov_b32 s3, 0x40140000 -; CHECK-NEXT: v_writelane_b32 v2, s6, 9 -; CHECK-NEXT: v_writelane_b32 v2, s7, 10 -; CHECK-NEXT: v_writelane_b32 v2, s0, 11 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40140000 +; CHECK-NEXT: v_writelane_b32 v2, s0, 5 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] ; CHECK-NEXT: v_readlane_b32 s6, v2, 1 ; CHECK-NEXT: v_readlane_b32 s7, v2, 2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: s_mov_b32 s1, s7 -; CHECK-NEXT: s_mov_b32 s0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s6, 1 -; CHECK-NEXT: v_writelane_b32 v2, s7, 2 -; CHECK-NEXT: v_readlane_b32 s6, v2, 9 -; CHECK-NEXT: v_readlane_b32 s7, v2, 10 -; CHECK-NEXT: s_mov_b32 s6, s2 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40140000 +; CHECK-NEXT: s_mov_b32 s0, s6 +; CHECK-NEXT: v_readlane_b32 s6, v2, 6 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1] -; CHECK-NEXT: v_readlane_b32 s0, v2, 3 -; CHECK-NEXT: v_readlane_b32 s1, v2, 4 -; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v2, 7 ; CHECK-NEXT: 
s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_mov_b32 s6, s0 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] -; CHECK-NEXT: v_writelane_b32 v2, s0, 3 -; CHECK-NEXT: v_writelane_b32 v2, s1, 4 -; CHECK-NEXT: v_readlane_b32 s0, v2, 5 -; CHECK-NEXT: v_readlane_b32 s1, v2, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s3, s1 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_readlane_b32 s0, v2, 7 -; CHECK-NEXT: v_readlane_b32 s1, v2, 8 -; CHECK-NEXT: s_mov_b32 s3, s1 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 -; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: v_readlane_b32 s6, v2, 8 +; CHECK-NEXT: v_readlane_b32 s7, v2, 9 +; CHECK-NEXT: s_mov_b32 s6, s0 +; CHECK-NEXT: v_readlane_b32 s0, v2, 3 +; CHECK-NEXT: v_readlane_b32 s1, v2, 4 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s7, 0x40140000 +; CHECK-NEXT: s_mov_b32 s0, s6 +; CHECK-NEXT: s_mov_b32 s2, s6 +; CHECK-NEXT: s_mov_b32 s4, s6 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1] ; CHECK-NEXT: v_readlane_b32 s0, v2, 0 -; CHECK-NEXT: v_readlane_b32 s2, v2, 11 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; CHECK-NEXT: v_readlane_b32 s2, v2, 5 ; CHECK-NEXT: s_add_i32 s2, s2, s0 -; CHECK-NEXT: v_writelane_b32 v2, s2, 11 -; CHECK-NEXT: v_readlane_b32 s0, v2, 11 +; CHECK-NEXT: v_writelane_b32 v2, s2, 5 +; CHECK-NEXT: 
v_readlane_b32 s0, v2, 5 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit ; CHECK-NEXT: v_mov_b32_e32 v3, 0 Index: llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir =================================================================== --- llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir +++ llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir @@ -153,8 +153,8 @@ %13 = S2_asl_r_p_acc %13, %47, %8.isub_lo %51 = A2_tfrpi 0 - ; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25 - ; CHECK: $d0 = A2_tfrpi 2 + ; CHECK: $d0 = S2_extractup undef renamable $d0, 6, 25 + ; CHECK: $d1 = A2_tfrpi 2 ; CHECK: $d13 = A2_tfrpi -1 ; CHECK-NOT: undef $r4 Index: llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -1024,10 +1024,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { ; CHECK-LABEL: DCT_mve7: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #72 @@ -1074,7 +1072,6 @@ ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: mov r12, r7 ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill @@ -1083,20 +1080,16 @@ ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: add.w r10, r3, r5 +; 
CHECK-NEXT: adds r6, r3, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 -; CHECK-NEXT: add.w r11, r10, r5 +; CHECK-NEXT: add.w r11, r6, r5 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q5, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r10] -; CHECK-NEXT: add.w r6, r11, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q6, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 @@ -1178,8 +1171,7 @@ ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #72 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r11, pc} entry: %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 %i = load i32, ptr %NumInputs, align 4 @@ -1354,7 +1346,6 @@ ; CHECK-NEXT: adds r1, r0, #1 ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q7, q3 ; CHECK-NEXT: vmov q2, q3 @@ -1367,46 +1358,43 @@ ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.32 r10 ; CHECK-NEXT: add.w r11, r3, r6 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 +; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 -; CHECK-NEXT: add.w r5, r11, r6 -; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q6, q1, q0 +; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: add.w r5, r11, r6 ; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vpst ; CHECK-NEXT: 
vldrwt.u32 q1, [r5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: adds r7, r5, r6 +; CHECK-NEXT: adds r5, r7, r6 +; CHECK-NEXT: sub.w r10, r10, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload -; CHECK-NEXT: adds r5, r7, r6 +; CHECK-NEXT: adds r7, r5, r6 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: adds r7, r5, r6 ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r7] +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: adds r5, r7, r6 -; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q4, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] +; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: add r5, r6 ; CHECK-NEXT: vpstt Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -1077,63 +1077,58 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #128] -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r0, #160] -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vldrw.u32 q5, [r0, #144] -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #96] -; CHECK-NEXT: vmov.f32 s26, s6 ; CHECK-NEXT: vldrw.u32 q7, [r0, #112] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [r0, #96] +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vldrw.u32 q5, [r0, #144] ; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: 
vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov.f32 s24, s9 ; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vmov.f32 s26, s6 ; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: vmov.f32 s24, s2 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vmov.f32 s27, s10 +; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vstrw.32 q6, [r1, #16] ; CHECK-NEXT: vmov.f32 s13, s8 +; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s1, s28 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s4, s16 -; CHECK-NEXT: vmov.f32 s5, s28 -; CHECK-NEXT: vmov.f32 s7, s17 -; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s2, s31 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s26, s11 ; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s15, s30 ; CHECK-NEXT: vstrw.32 q6, [r1, #32] ; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vldrw.u32 q6, [sp, 
#80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s30, s0 ; CHECK-NEXT: vmov.f32 s0, s2 ; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vmov.f32 s2, s7 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s18, s10 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s28, s8 ; CHECK-NEXT: vmov.f32 s31, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s12, s29 ; CHECK-NEXT: vmov.f32 s29, s4 ; CHECK-NEXT: vstrw.32 q3, [r1, #160] @@ -1148,14 +1143,14 @@ ; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vmov.f32 s11, s2 ; CHECK-NEXT: vmov.f32 s22, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s7, s9 ; CHECK-NEXT: vstrw.32 q0, [r1, #128] ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s9, s21 ; CHECK-NEXT: vstrw.32 q1, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1, #144] -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s21, s27 ; CHECK-NEXT: vstrw.32 q2, [r1, #64] ; CHECK-NEXT: vstrw.32 q0, [r1, #176]