diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -375,6 +375,18 @@
       .addImm(8);
 }
 
+// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
+// memory. They should have been removed by now.
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
+
 // Shift down registers reserved for the scratch RSRC.
 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
     MachineFunction &MF) const {
@@ -389,7 +401,8 @@
 
   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
 
-  if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
+                          allStackObjectsAreDead(MF.getFrameInfo())))
     return Register();
 
   if (ST.hasSGPRInitBug() ||
@@ -1131,18 +1144,6 @@
   }
 }
 
-// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory. They should have been removed by now.
-static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    if (!MFI.isDeadObjectIndex(I))
-      return false;
-  }
-
-  return true;
-}
-
 #ifndef NDEBUG
 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1379,18 +1379,12 @@
       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }
 
-    Register ScratchRSrc =
-        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
-                               : MFI->getScratchRSrcReg();
     BuildMI(MBB, MI, DL, OpDesc)
         .addReg(SrcReg, getKillRegState(isKill)) // data
         .addFrameIndex(FrameIndex)               // addr
         .addMemOperand(MMO)
-        .addReg(ScratchRSrc, RegState::Implicit)
         .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
-    // Add the scratch resource registers as implicit uses because we may end up
-    // needing them, and need to ensure that the reserved registers are
-    // correctly handled.
+
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     return;
@@ -1400,13 +1394,9 @@
                                     : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();
 
-  Register ScratchRSrc =
-      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
-                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode))
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
-      .addReg(ScratchRSrc)                     // scratch_rsrc
       .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
       .addImm(0)                               // offset
       .addMemOperand(MMO);
@@ -1519,27 +1509,20 @@
       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }
 
-    Register ScratchRSrc =
-        ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
-                               : MFI->getScratchRSrcReg();
     if (RI.spillSGPRToVGPR())
       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
         .addFrameIndex(FrameIndex) // addr
         .addMemOperand(MMO)
-        .addReg(ScratchRSrc, RegState::Implicit)
         .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
+
     return;
   }
 
   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                     : getVGPRSpillRestoreOpcode(SpillSize);
-  Register ScratchRSrc =
-      ST.enableFlatScratch() ? AMDGPU::TTMP0_TTMP1_TTMP2_TTMP3 // Dummy
-                             : MFI->getScratchRSrcReg();
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
       .addFrameIndex(FrameIndex)           // vaddr
-      .addReg(ScratchRSrc)                 // scratch_rsrc
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
       .addImm(0)                           // offset
       .addMemOperand(MMO);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -644,7 +644,7 @@
       SchedRW = [WriteVMEM] in {
     def _SAVE : VPseudoInstSI <
       (outs),
-      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+      (ins vgpr_class:$vdata, i32imm:$vaddr,
            SReg_32:$soffset, i32imm:$offset)> {
       let mayStore = 1;
       let mayLoad = 0;
@@ -656,8 +656,8 @@
 
     def _RESTORE : VPseudoInstSI <
       (outs vgpr_class:$vdata),
-      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
-           i32imm:$offset)> {
+      (ins i32imm:$vaddr,
+           SReg_32:$soffset, i32imm:$offset)> {
       let mayStore = 0;
       let mayLoad = 1;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -334,7 +334,6 @@
                            int Index,
                            Register ValueReg,
                            bool ValueIsKill,
-                           MCRegister ScratchRsrcReg,
                            MCRegister ScratchOffsetReg,
                            int64_t InstrOffset,
                            MachineMemOperand *MMO,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -748,7 +748,6 @@
                                          int Index,
                                          Register ValueReg,
                                          bool IsKill,
-                                         MCRegister ScratchRsrcReg,
                                          MCRegister ScratchOffsetReg,
                                          int64_t InstOffset,
                                          MachineMemOperand *MMO,
@@ -888,7 +887,7 @@
             .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
 
     if (!IsFlat)
-      MIB.addReg(ScratchRsrcReg);
+      MIB.addReg(FuncInfo->getScratchRSrcReg());
 
     if (SOffset == AMDGPU::NoRegister) {
       if (!IsFlat)
@@ -1009,14 +1008,14 @@
       buildSpillLoadStore(MI, Opc,
             Index,
             VGPR, false,
-            MFI->getScratchRSrcReg(), FrameReg,
+            FrameReg,
             Offset * EltSize, MMO,
             RS);
     } else {
       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
       buildSpillLoadStore(MI, Opc, Index, VGPR,
-                          IsKill, MFI->getScratchRSrcReg(), FrameReg,
+                          IsKill, FrameReg,
                           Offset * EltSize, MMO, RS);
       // This only ever adds one VGPR spill
       MFI->addToSpilledVGPRs(1);
@@ -1354,7 +1353,6 @@
       buildSpillLoadStore(MI, Opc,
             Index,
             VData->getReg(), VData->isKill(),
-            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
             FrameReg,
             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
             *MI->memoperands_begin(),
@@ -1390,7 +1388,6 @@
       buildSpillLoadStore(MI, Opc,
             Index,
             VData->getReg(), VData->isKill(),
-            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
             FrameReg,
             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
             *MI->memoperands_begin(),
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -342,7 +342,7 @@
 ; HSA-LABEL: {{^}}store_flat_scratch:
 ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
 ; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
-; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
 
 ; GFX9: s_add_u32 flat_scratch_lo, s6, s9
 ; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir b/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir
--- a/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/fast-ra-kills-vcc.mir
@@ -46,9 +46,9 @@
     ; CHECK: liveins: $vgpr0
     ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     ; CHECK: renamable $sgpr4_sgpr5 = COPY $vcc
-    ; CHECK: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; CHECK: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5)
     ; CHECK: renamable $sgpr4_sgpr5 = COPY $vcc
-    ; CHECK: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; CHECK: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5)
     ; CHECK: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, 3, killed $sgpr4_sgpr5, implicit $exec
     ; CHECK: S_ENDPGM 0, implicit killed $vgpr0, implicit killed renamable $vcc
     %0:vgpr_32 = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
--- a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
+++ b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
@@ -13,10 +13,10 @@
     ; GCN: bb.0:
     ; GCN: successors: %bb.1(0x80000000)
    ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     ; GCN: bb.1:
     ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
     ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
     ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
@@ -48,14 +48,14 @@
     ; GCN: bb.0:
     ; GCN: successors: %bb.1(0x80000000)
     ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     ; GCN: bb.1:
     ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
     ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
     ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
-    ; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+    ; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
     ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
     ; GCN: bb.2:
@@ -90,12 +90,12 @@
     ; GCN: bb.0:
     ; GCN: successors: %bb.1(0x80000000)
     ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     ; GCN: bb.1:
     ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
     ; GCN: renamable $vgpr2 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec
-    ; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+    ; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec
     ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
     ; GCN: bb.2:
@@ -126,13 +126,13 @@
     ; GCN: bb.0:
     ; GCN: successors: %bb.1(0x80000000)
     ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     ; GCN: bb.1:
     ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec
     ; GCN: renamable $vgpr0 = V_ADD_U32_e64 1, 1, 0, implicit $exec
-    ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+    ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
     ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
     ; GCN: bb.2:
     ; GCN: S_ENDPGM 0
@@ -162,10 +162,10 @@
     ; GCN: bb.0:
     ; GCN: successors: %bb.1(0x80000000)
     ; GCN: liveins: $vgpr0_vgpr1
-    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+    ; GCN: SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
     ; GCN: bb.1:
     ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
-    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+    ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
     ; GCN: undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def dead $vgpr2_vgpr3
     ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr1, 0, 0, 0, 0, implicit $exec
     ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
--- a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
@@ -46,4 +46,4 @@
     liveins: $sgpr4, $sgpr5, $sgpr9, $sgpr22, $vgpr0, $sgpr6_sgpr7
 
     renamable $vgpr2 = IMPLICIT_DEF
-    SI_SPILL_V32_SAVE killed $vgpr2, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+    SI_SPILL_V32_SAVE killed $vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -13,15 +13,19 @@
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004
 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
 ; GFX7-NEXT: v_mov_b32_e32 v1, s20
@@ -52,18 +56,26 @@
 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0
 ; GFX7-NEXT: s_ashr_i32 s4, s4, 28
 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc32:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
+; GFX8-NEXT: s_add_u32 s20, s20, s3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000
 ; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000
@@ -105,12 +117,18 @@
 ;
 ; GFX9-LABEL: idot8_acc32:
 ; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
 ; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
@@ -154,10 +172,16 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s10, -1
+; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
@@ -169,9 +193,15 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc32:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_clause 0x1
 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
@@ -257,15 +287,21 @@
 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc16:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s8, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
+; GFX7-NEXT: s_mov_b32 s8, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
@@ -323,12 +359,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000
 ; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000
@@ -374,12 +416,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s16, s16, s3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000
 ; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000
@@ -425,12 +473,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s18, -1
+; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
+; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000
 ; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
@@ -474,7 +528,13 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc16:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s14, -1
+; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -592,15 +652,21 @@
 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_movk_i32 s8, 0xff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
+; GFX7-NEXT: s_movk_i32 s8, 0xff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
@@ -658,12 +724,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s20, s20, s3
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_movk_i32 s0, 0xff
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_lshr_b32 s3, s1, 12
@@ -712,12 +784,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_movk_i32 s0, 0xff
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s3, s1, 12
@@ -766,12 +844,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12
@@ -818,7 +902,13 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc8:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s14, -1
+; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -939,15 +1029,19 @@
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000
+; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: v_mov_b32_e32 v0, s7
 ; GFX7-NEXT: v_mov_b32_e32 v1, s20
 ; GFX7-NEXT: v_mad_i32_i24 v1, s6, v0, v1
@@ -980,18 +1074,26 @@
 ; GFX7-NEXT: s_ashr_i32 s4, s4, 28
 ; GFX7-NEXT: v_mov_b32_e32 v2, s5
 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v2, v0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: idot8_multiuses_mul1:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
+; GFX8-NEXT: s_add_u32 s20, s20, s3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000
 ; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000
@@ -1035,12 +1137,18 @@
 ;
 ; GFX9-LABEL: idot8_multiuses_mul1:
 ; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
 ; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
@@ -1084,12 +1192,18 @@
 ;
 ; GFX9-DL-LABEL: idot8_multiuses_mul1:
 ; GFX9-DL: ; %bb.0: ; %entry
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000
 ; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000
@@ -1133,9 +1247,15 @@
 ;
 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_clause 0x1
 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
@@ -1249,14 +1369,17 @@
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_ashr_i32 s6, s4, 28
 ; GFX7-NEXT: s_ashr_i32 s13, s5, 28
 ; GFX7-NEXT: s_bfe_i32 s14, s5, 0x40018
 ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40014
@@ -1265,6 +1388,7 @@
 ; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
 ; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
 ; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40000
+; GFX7-NEXT: s_ashr_i32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
 ; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40014
 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40010
@@ -1288,18 +1412,26 @@
 ; GFX7-NEXT: v_mov_b32_e32 v1, s14
 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v1, v0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc32_vecMul:
 ; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
+; GFX8-NEXT: s_add_u32 s20, s20, s3
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_ashr_i32 s4, s2, 28
 ; GFX8-NEXT: s_ashr_i32 s11, s3, 28
@@ -1341,12 +1473,18 @@
 ;
 ; GFX9-LABEL: idot8_acc32_vecMul:
 ; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_ashr_i32 s4, s2, 28
 ; GFX9-NEXT: s_ashr_i32 s11, s3, 28
@@ -1390,10 +1528,16 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s10, -1
+; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
@@ -1405,9 +1549,15 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_clause 0x1
 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
@@ -1457,15 +1607,21 @@
 define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc16_vecMul:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_mov_b32 s8, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
+; GFX7-NEXT: s_mov_b32 s8, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_ashr_i32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
@@ -1519,12 +1675,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40000
 ; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40000
@@ -1567,8 +1729,14 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018
@@ -1634,8 +1802,14 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018
@@ -1699,7 +1873,13 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s14, -1
+; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -1802,16 +1982,22 @@
 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8_vecMul:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
-; GFX7-NEXT: s_movk_i32 s8, 0xff
-; GFX7-NEXT: s_mov_b32 s9, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
+; GFX7-NEXT: s_movk_i32 s8, 0xff
+; GFX7-NEXT: s_mov_b32 s9, 0xffff
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000
 ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000
@@ -1885,12 +2071,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s20, s20, s3
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_mov_b32 s0, 0xffff
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004
@@ -1955,12 +2147,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_mov_b32 s0, 0xffff
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s7, s1, 4
@@ -2043,12 +2241,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4
@@ -2129,7 +2333,13 @@
 ;
 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s22, -1
+; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s20, s20, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -11,12 +11,16 @@
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_lshr_b32 s7, s6, 28
 ; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018
@@ -50,6 +54,8 @@
 ; GFX7-NEXT: v_mov_b32_e32 v1, s14
 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
@@ -58,10 +64,16 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
+; GFX8-NEXT: s_add_u32 s20, s20, s3
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28
 ; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018
@@ -105,10 +117,16 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28
 ; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018
@@ -152,10 +170,16 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s10, -1
+; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
@@ -167,9 +191,15 @@
 ;
 ; GFX10-DL-LABEL: udot8_acc32:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_clause 0x1
 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
@@ -255,14 +285,20 @@
 define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc16:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s22, -1
+; GFX7-NEXT: s_mov_b32 s23, 0xe8f000
+; GFX7-NEXT: s_add_u32 s20, s20, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s21, s21, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_lshr_b32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
@@ -304,12 +340,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ushort v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -352,12 +394,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s16, s16, s3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -400,12 +448,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s18, -1
+; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
+; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -446,7 +500,13 @@
 ;
 ; GFX10-DL-LABEL: udot8_acc16:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s6, -1
+; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s4, s4, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -558,14 +618,20 @@
 define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc8:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s22, -1
+; GFX7-NEXT: s_mov_b32 s23, 0xe8f000
+; GFX7-NEXT: s_add_u32 s20, s20, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s21, s21, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_lshr_b32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
@@ -607,12 +673,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -655,12 +727,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s16, s16, s3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -703,12 +781,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s18, -1
+; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
+; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
 ; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
@@ -749,7 +833,13 @@
 ;
 ; GFX10-DL-LABEL: udot8_acc8:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s6, -1
+; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s4, s4, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -861,14 +951,20 @@
 define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc4:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s22, -1
+; GFX7-NEXT: s_mov_b32 s23, 0xe8f000
+; GFX7-NEXT: s_add_u32 s20, s20, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s21, s21, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_lshr_b32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
@@ -911,12 +1007,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_and_b32 s8, s0, 15
 ; GFX8-NEXT: s_and_b32 s15, s1, 15
@@ -962,12 +1064,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s16, s16, s3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_and_b32 s8, s0, 15
 ; GFX9-NEXT: s_and_b32 s15, s1, 15
@@ -1013,12 +1121,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s18, -1
+; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
+; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
@@ -1062,7 +1176,13 @@
 ;
 ; GFX10-DL-LABEL: udot8_acc4:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -1161,14 +1281,20 @@
 define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_CommutationInsideMAD:
 ; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s22, -1
+; GFX7-NEXT: s_mov_b32 s23, 0xe8f000
+; GFX7-NEXT: s_add_u32 s20, s20, s3
 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
 ; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT: s_addc_u32 s21, s21, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_lshr_b32 s6, s4, 28
 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
@@ -1211,12 +1337,18 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s18, -1
+; GFX8-NEXT: s_mov_b32 s19, 0xe80000
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s16, s16, s3
+; GFX8-NEXT: s_addc_u32 s17, s17, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_and_b32 s8, s0, 15
 ; GFX8-NEXT: s_and_b32 s15, s1, 15
@@ -1262,12 +1394,18 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_add_u32 s16, s16, s3
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_and_b32 s8, s0, 15
 ; GFX9-NEXT: s_and_b32 s15, s1, 15
@@ -1313,12 +1451,18 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s18, -1
+; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
+; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
@@ -1362,7 +1506,13 @@
 ;
 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
@@ -1461,12 +1611,16 @@
 ; GFX7: ; %bb.0: ; %entry
 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GFX7-NEXT: s_mov_b32 s26, -1
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 s27, 0xe8f000
+; GFX7-NEXT: s_add_u32 s24, s24, s3
+; GFX7-NEXT: s_addc_u32 s25, s25, 0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004
 ; GFX7-NEXT: s_lshr_b32 s7, s6, 28
@@ -1502,6 +1656,8 @@
 ; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1
 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
 ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT: s_endpgm
@@ -1510,10 +1666,16 @@
 ; GFX8: ; %bb.0: ; %entry
 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s22, -1
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_mov_b32 s23, 0xe80000
+; GFX8-NEXT: s_add_u32 s20, s20, s3
+; GFX8-NEXT: s_addc_u32 s21, s21, 0
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004
 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28
@@ -1559,10 +1721,16 @@
 ; GFX9: ; %bb.0: ; %entry
 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
-; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004
 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28
@@ -1608,10 +1776,16 @@
 ; GFX9-DL: ; %bb.0: ; %entry
 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32
s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 @@ -1655,9 +1829,15 @@ ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1770,12 +1950,16 @@ ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s26, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 s27, 0xe8f000 +; GFX7-NEXT: s_add_u32 s24, s24, s3 +; GFX7-NEXT: s_addc_u32 s25, s25, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s7, s6, 28 ; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 @@ -1809,6 +1993,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s14 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1817,10 +2003,16 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s22, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s23, 0xe80000 +; GFX8-NEXT: s_add_u32 s20, s20, s3 +; GFX8-NEXT: s_addc_u32 s21, s21, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 ; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 @@ -1864,10 +2056,16 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 ; GFX9-NEXT: 
s_bfe_u32 s12, s6, 0x40018 @@ -1911,10 +2109,16 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s10, -1 +; GFX9-DL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 @@ -1926,9 +2130,15 @@ ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1979,14 +2189,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s22, -1 +; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 +; GFX7-NEXT: s_add_u32 s20, s20, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_addc_u32 s21, s21, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c ; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c @@ -2038,12 +2254,18 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s18, -1 +; GFX8-NEXT: s_mov_b32 s19, 0xe80000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_add_u32 s16, s16, s3 +; GFX8-NEXT: s_addc_u32 s17, s17, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -2086,9 +2308,15 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 @@ -2141,9 +2369,15 @@ ; 
GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 @@ -2194,7 +2428,13 @@ ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -2281,14 +2521,20 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s22, -1 +; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 +; GFX7-NEXT: s_add_u32 s20, s20, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_addc_u32 s21, s21, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s6, s4, 0x4000c ; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c @@ -2356,12 +2602,18 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s22, -1 +; GFX8-NEXT: s_mov_b32 s23, 0xe80000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_add_u32 s20, s20, s3 +; GFX8-NEXT: s_addc_u32 s21, s21, 0 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 @@ -2426,12 +2678,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010 @@ -2492,12 +2750,18 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 @@ -2556,7 +2820,13 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 @@ -2654,14 +2924,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s22, -1 +; GFX7-NEXT: s_mov_b32 s23, 0xe8f000 +; GFX7-NEXT: s_add_u32 s20, s20, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_addc_u32 s21, s21, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s6, s4, 28 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 @@ -2704,12 +2980,18 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_mov_b32 s18, -1 +; GFX8-NEXT: s_mov_b32 s19, 0xe80000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_add_u32 s16, s16, s3 +; GFX8-NEXT: s_addc_u32 s17, s17, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s8, s0, 15 ; GFX8-NEXT: s_and_b32 s15, s1, 15 @@ -2755,12 +3037,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s18, -1 +; GFX9-NEXT: s_mov_b32 s19, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_add_u32 s16, s16, s3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s8, s0, 15 ; GFX9-NEXT: s_and_b32 s15, s1, 15 @@ -2806,12 +3094,18 @@ ; GFX9-DL: ; %bb.0: ; %entry ; 
GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s18, -1 +; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 +; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 @@ -2855,7 +3149,13 @@ ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -12,7 +12,7 @@ ; GCN: bb.0.entry: ; GCN: successors: %bb.1(0x80000000) ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 - ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) ; GCN: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 8 from %ir.out.kernarg.offset.cast, align 4, addrspace 4) ; GCN: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 @@ -22,7 +22,7 @@ ; GCN: renamable $sgpr1 = COPY killed renamable $sgpr6 ; GCN: renamable $sgpr2 = COPY killed renamable $sgpr5 ; GCN: renamable $sgpr3 = COPY killed renamable $sgpr4 - ; GCN: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.2, align 4, addrspace 5) + ; GCN: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.2, align 4, addrspace 5) ; GCN: renamable $sgpr0 = S_MOV_B32 16 ; GCN: renamable $sgpr1 = S_MOV_B32 15 ; GCN: renamable $sgpr2 = S_MOV_B32 14 @@ -71,36 +71,36 @@ ; GCN: renamable $vgpr13 = COPY killed renamable $vgpr18 ; GCN: renamable $vgpr14 = COPY killed renamable $vgpr17 ; GCN: renamable $vgpr15 = COPY killed renamable $vgpr16 - ; GCN: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.1, align 4, addrspace 5) + ; GCN: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store 64 into %stack.1, align 4, addrspace 5) ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec - ; GCN: 
SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) ; GCN: renamable $vgpr0 = IMPLICIT_DEF ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.4, align 4, addrspace 5) - ; GCN: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.5, addrspace 5) - ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.1, align 4, addrspace 5) - ; GCN: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.4, align 4, addrspace 5) + ; GCN: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 4 from %stack.5, addrspace 5) + ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 64 from %stack.1, align 4, addrspace 5) + ; GCN: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) ; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec ; GCN: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec ; GCN: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit-def undef $mode, implicit $m0, implicit $mode ; GCN: renamable $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 - ; GCN: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.6, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store 4 into %stack.6, addrspace 5) ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.5, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store 4 into %stack.5, addrspace 5) ; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1 - ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.4, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.4, align 4, addrspace 5) ; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc ; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN: bb.3: ; GCN: successors: %bb.2(0x80000000) - ; GCN: $sgpr0_sgpr1 = 
SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5) + ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5) ; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN: bb.2: - ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5) - ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 16 from %stack.2, align 4, addrspace 5) + ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5) + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.2, align 4, addrspace 5) ; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) ; GCN: S_ENDPGM 0 entry: diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir --- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -37,10 +37,10 @@ ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_BRANCH %bb.1 bb.1: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0, implicit $vgpr0 ... 
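A note for readers tracking the operand change through the test updates: the spill pseudos drop the scratch-rsrc SGPR quad (explicit on the VGPR/AGPR forms, implicit on the SGPR forms), keeping only the frame index, the stack pointer, and the immediate offset. Condensed from the pei-reg-scavenger-position.mir hunk above (illustrative only, not an additional test):

  old: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec
  new: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec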
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -33,6 +33,7 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; CHECK: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc ; CHECK: $vgpr2 = COPY killed $sgpr33 @@ -43,6 +44,7 @@ ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 S_ENDPGM 0, implicit $vcc ... 
@@ -75,6 +77,7 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; CHECK: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc ; CHECK: $vgpr2 = COPY killed $sgpr29 @@ -83,6 +86,7 @@ ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 S_ENDPGM 0, implicit $vcc ... 
@@ -115,6 +119,7 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; CHECK: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr28 = S_MOV_B32 8192 ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_CO_U32_e64 killed $sgpr28, killed $vgpr2, 0, implicit $exec @@ -123,6 +128,7 @@ ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 S_ENDPGM 0, implicit $vcc ... 
@@ -154,6 +160,7 @@ ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 + ; CHECK: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vcc_lo = S_MOV_B32 8192 ; CHECK: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec @@ -162,6 +169,7 @@ ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 ; CHECK: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 S_ENDPGM 0 ... 
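The carry-out cases above are the interesting ones: once the S_NOP has clobbered every usable SGPR pair, the scavenger has no scalar register left to hold the large frame offset, so both the offset immediate and the carry-out of the add land in $vcc. Condensed from the last function's checks (the register choices are what this test expects, not a general rule):

  $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
  $vcc_lo = S_MOV_B32 8192
  $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec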
diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -29,6 +29,7 @@ ; MUBUF: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; MUBUF: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; MUBUF: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; MUBUF: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec ; MUBUF: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 @@ -42,6 +43,7 @@ ; FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc ; FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc ; FLATSCR: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; FLATSCR: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec ; FLATSCR: $sgpr33 = S_ADD_U32 $sgpr33, 8192, implicit-def $scc ; FLATSCR: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR: $sgpr33 = S_SUB_U32 $sgpr33, 8192, implicit-def $scc @@ -49,6 +51,7 @@ ; FLATSCR: $sgpr33 = frame-setup COPY $sgpr27 ; FLATSCR: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def 
$sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 S_ENDPGM 0, implicit $vcc ... diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -29,6 +29,7 @@ ; GFX8: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -46,6 +47,7 @@ ; GFX9: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -62,11 +64,13 @@ ; GFX9-FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc ; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc ; GFX9-FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc + ; GFX9-FLATSCR: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX9-FLATSCR: $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc ; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec ; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs + $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec S_ENDPGM 0, csr_amdgpu_allvgprs ... 
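The MUBUF/FLATSCR split in the new checks comes down to how the frame pointer is encoded: under MUBUF the SGPR frame pointer is wave-scaled, so turning it into a per-lane byte offset shifts right by 6 (assuming a 64-lane wave), while under flat scratch it is already a plain byte offset and a move suffices. Side by side, from the checks above:

  ; MUBUF:   $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
  ; FLATSCR: $vgpr0 = V_MOV_B32_e32 $sgpr33, implicit $exec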
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -33,16 +33,16 @@ # SHARE: stack-id: sgpr-spill, callee-saved-register: '', callee-saved-restored: true, # SHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) -# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 -# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) -# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 -# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit undef $vgpr0 +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $vgpr0 +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: stack: # NOSHARE: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, @@ -58,17 +58,17 @@ # NOSHARE: stack-id: sgpr-spill, callee-saved-register: '', callee-saved-restored: true, # 
NOSHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 -# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) -# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 -# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit undef $vgpr0 +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) +# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $vgpr0 +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) ... 
@@ -87,13 +87,13 @@ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit undef $vgpr0 $sgpr32 = COPY %0 %4:sreg_32_xm0 = COPY $sgpr32 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 $vgpr0 = COPY %2 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit killed $vgpr0 $sgpr32 = COPY %4 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir --- a/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir @@ -20,7 +20,7 @@ ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... # Make sure there's no verifier error on the undef spill component when the value is killed. @@ -44,7 +44,7 @@ ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... 
 ---
@@ -66,5 +66,5 @@
 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5)
 ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec
 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5)
- SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
+ SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5)
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
@@ -13,16 +13,16 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0
- ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+ ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0
- ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+ ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
- ; SPILLED: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+ ; SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5)
+ ; SPILLED: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
 ; EXPANDED-LABEL: name: spill_restore_agpr32
 ; EXPANDED: bb.0:
@@ -65,13 +65,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1
- ; SPILLED: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1
 ; EXPANDED-LABEL: name: spill_restore_agpr64
 ; EXPANDED: bb.0:
@@ -113,12 +113,12 @@
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0
- ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+ ; SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+ ; SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
 ; SPILLED: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SPILLED: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
 ; SPILLED: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -207,13 +207,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2
- ; SPILLED: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
 ; EXPANDED-LABEL: name: spill_restore_agpr96
 ; EXPANDED: bb.0:
@@ -256,13 +256,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3
- ; SPILLED: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
 ; EXPANDED-LABEL: name: spill_restore_agpr128
 ; EXPANDED: bb.0:
@@ -307,13 +307,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4
- ; SPILLED: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
 ; EXPANDED-LABEL: name: spill_restore_agpr160
 ; EXPANDED: bb.0:
@@ -360,13 +360,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
- ; SPILLED: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
 ; EXPANDED-LABEL: name: spill_restore_agpr192
 ; EXPANDED: bb.0:
@@ -415,13 +415,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
- ; SPILLED: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
 ; EXPANDED-LABEL: name: spill_restore_agpr256
 ; EXPANDED: bb.0:
@@ -474,13 +474,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
- ; SPILLED: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
 ; EXPANDED-LABEL: name: spill_restore_agpr512
 ; EXPANDED: bb.0:
@@ -549,13 +549,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
- ; SPILLED: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
 ; EXPANDED-LABEL: name: spill_restore_agpr1024
 ; EXPANDED: bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -8,12 +8,12 @@
 # CHECK-LABEL: name: expecting_non_empty_interval
 # CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec
-# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
 # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
 # CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec
 # CHECK: S_NOP 0, implicit %6.sub1
-# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
 # CHECK-NEXT: S_NOP 0, implicit %8.sub1
 # CHECK-NEXT: S_NOP 0, implicit undef %9.sub0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
--- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir
@@ -29,7 +29,7 @@
 ; GCN: renamable $sgpr8 = COPY killed renamable $sgpr1
 ; GCN: S_ENDPGM 0, implicit $sgpr8
 renamable $sgpr1 = COPY $sgpr2
- SI_SPILL_S128_SAVE renamable $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
+ SI_SPILL_S128_SAVE renamable $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
 renamable $sgpr8 = COPY killed renamable $sgpr1
 S_ENDPGM 0, implicit $sgpr8
 ...
@@ -58,7 +58,7 @@
 ; GCN: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
 ; GCN: S_ENDPGM 0
 renamable $sgpr1 = COPY $sgpr2
- SI_SPILL_S128_SAVE renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
+ SI_SPILL_S128_SAVE renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5)
 S_ENDPGM 0
 ...
@@ -86,7 +86,7 @@
 ; GCN: renamable $vgpr8 = COPY killed renamable $vgpr1
 ; GCN: S_ENDPGM 0, implicit $vgpr8
 renamable $vgpr1 = COPY $vgpr2
- SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
 renamable $vgpr8 = COPY killed renamable $vgpr1
 S_ENDPGM 0, implicit $vgpr8
 ...
@@ -114,6 +114,6 @@
 ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5)
 ; GCN: S_ENDPGM 0
 renamable $vgpr1 = COPY $vgpr2
- SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
 S_ENDPGM 0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir
--- a/llvm/test/CodeGen/AMDGPU/spill192.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill192.mir
@@ -17,13 +17,13 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
- ; SPILLED: SI_SPILL_S192_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 24 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_S192_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, %stack.0, implicit $exec, implicit $sgpr32 :: (store 24 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 = SI_SPILL_S192_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 24 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 = SI_SPILL_S192_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 24 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9
 ; EXPANDED-LABEL: name: spill_restore_sgpr192
 ; EXPANDED: bb.0:
@@ -72,25 +72,25 @@
 ; SPILLED: bb.0:
 ; SPILLED: successors: %bb.1(0x80000000)
 ; SPILLED: S_NOP 0, implicit-def renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
- ; SPILLED: SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
+ ; SPILLED: SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
 ; SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; SPILLED: bb.1:
 ; SPILLED: successors: %bb.2(0x80000000)
 ; SPILLED: S_NOP 1
 ; SPILLED: bb.2:
- ; SPILLED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
+ ; SPILLED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
 ; SPILLED: S_NOP 0, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
 ; EXPANDED-LABEL: name: spill_restore_vgpr192
 ; EXPANDED: bb.0:
 ; EXPANDED: successors: %bb.1(0x80000000)
 ; EXPANDED: S_NOP 0, implicit-def renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
- ; EXPANDED: SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
+ ; EXPANDED: SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5)
 ; EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
 ; EXPANDED: bb.1:
 ; EXPANDED: successors: %bb.2(0x80000000)
 ; EXPANDED: S_NOP 1
 ; EXPANDED: bb.2:
- ; EXPANDED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
+ ; EXPANDED: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5)
 ; EXPANDED: S_NOP 0, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
 bb.0:
 S_NOP 0, implicit-def %0:vreg_192
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
@@ -266,9 +266,9 @@
 ; RA: internal %15.sub13:sgpr_512 = COPY [[DEF2]].sub13
 ; RA: internal %15.sub14:sgpr_512 = COPY [[DEF2]].sub14
 ; RA: }
- ; RA: SI_SPILL_S512_SAVE %15, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 64 into %stack.0, align 4, addrspace 5)
+ ; RA: SI_SPILL_S512_SAVE %15, %stack.0, implicit $exec, implicit $sgpr32 :: (store 64 into %stack.0, align 4, addrspace 5)
 ; RA: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98
- ; RA: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 64 from %stack.0, align 4, addrspace 5)
+ ; RA: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 64 from %stack.0, align 4, addrspace 5)
 ; RA: undef %14.sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 {
 ; RA: internal %14.sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11
 ; RA: internal %14.sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7
@@ -295,9 +295,9 @@
 ; VR: renamable $sgpr20 = S_MOV_B32 -1
 ; VR: renamable $sgpr25 = S_MOV_B32 -1
 ; VR: renamable $sgpr26 = S_MOV_B32 -1
- ; VR: SI_SPILL_S512_SAVE killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 64 into %stack.0, align 4, addrspace 5)
+ ; VR: SI_SPILL_S512_SAVE killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, %stack.0, implicit $exec, implicit $sgpr32 :: (store 64 into %stack.0, align 4, addrspace 5)
 ; VR: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98
- ; VR: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 64 from %stack.0, align 4, addrspace 5)
+ ; VR: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 64 from %stack.0, align 4, addrspace 5)
 ; VR: renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr16_sgpr17
 ; VR: renamable $sgpr15 = COPY killed renamable $sgpr19
 ; VR: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr22_sgpr23
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -30,45 +30,45 @@
 ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
 ; CHECK: }
 ; CHECK: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
 ; CHECK: undef %52.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %52, %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %52, %stack.1, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5)
 ; CHECK: undef %57.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %57, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %57, %stack.2, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5)
 ; CHECK: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5)
 ; CHECK: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
 ; CHECK: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %71, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %71, %stack.4, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5)
 ; CHECK: undef %76.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %76, %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %76, %stack.5, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5)
 ; CHECK: undef %81.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %81, %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %81, %stack.6, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5)
 ; CHECK: undef %86.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
 ; CHECK: undef %90.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %90, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %90, %stack.7, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5)
 ; CHECK: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5)
 ; CHECK: undef %100.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %100, %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %100, %stack.9, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5)
 ; CHECK: undef %105.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
 ; CHECK: undef %109.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
 ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
 ; CHECK: undef %117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
 ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1)
 ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
 ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
 ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
 ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
 ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1)
 ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
 ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
 ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
 ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
 ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1)
 ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
@@ -81,67 +81,67 @@
 ; CHECK: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
 ; CHECK: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE]], %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE]], %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE1]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE1]], %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE1]], %stack.1, $sgpr32, 0, implicit $exec :: (store 16 into %stack.1, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE2]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE2]], %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE2]], %stack.2, $sgpr32, 0, implicit $exec :: (store 16 into %stack.2, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr32, 0, implicit $exec :: (store 16 into %stack.3, align 4, addrspace 5)
 ; CHECK: undef %68.sub2:vreg_128 = COPY %67.sub2
 ; CHECK: %68.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
- ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr32, 0, implicit $exec :: (store 16 into %stack.4, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr32, 0, implicit $exec :: (store 16 into %stack.5, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr32, 0, implicit $exec :: (store 16 into %stack.6, align 4, addrspace 5)
 ; CHECK: undef %87.sub2:vreg_128 = COPY %86.sub2
 ; CHECK: %87.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
- ; CHECK: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr32, 0, implicit $exec :: (store 16 into %stack.7, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.8, $sgpr32, 0, implicit $exec :: (store 16 into %stack.8, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.9, $sgpr32, 0, implicit $exec :: (store 16 into %stack.9, align 4, addrspace 5)
 ; CHECK: undef %106.sub2:vreg_128 = COPY %105.sub2
 ; CHECK: %106.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
 ; CHECK: undef %110.sub2:vreg_128 = COPY %109.sub2
 ; CHECK: %110.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
 ; CHECK: undef %114.sub2:vreg_128 = COPY %113.sub2
 ; CHECK: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
- ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.10, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5)
 ; CHECK: undef %123.sub2:vreg_128 = COPY %122.sub2
 ; CHECK: %123.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
 ; CHECK: undef %127.sub2:vreg_128 = COPY %126.sub2
 ; CHECK: %127.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
- ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.11, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.12, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5)
 ; CHECK: undef %141.sub2:vreg_128 = COPY %140.sub2
 ; CHECK: %141.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
- ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.13, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5)
- ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
 ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
- ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.14, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5)
 ; CHECK: undef %155.sub2:vreg_128 = COPY %154.sub2
 ; CHECK: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
 ; CHECK: undef %159.sub2:vreg_128 = COPY %158.sub2
@@ -186,14 +186,14 @@
 ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5)
 ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
 ; CHECK: internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
 ; CHECK: }
 ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5)
 ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
 ; CHECK: internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
 ; CHECK: }
@@ -206,14 +206,14 @@
 ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5)
 ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
 ; CHECK: internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
 ; CHECK: }
 ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5)
 ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
 ; CHECK: internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
 ; CHECK: }
@@ -232,7 +232,7 @@
 ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5)
 ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
 ; CHECK: internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
 ; CHECK: }
@@ -257,21 +257,21 @@
 ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5)
 ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
 ; CHECK: internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
 ; CHECK: }
 ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5)
 ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
 ; CHECK: internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
 ; CHECK: }
 ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5)
 ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
 ; CHECK: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
 ; CHECK: }
@@ -284,21 +284,21 @@
 ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5)
 ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
 ; CHECK: internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
 ; CHECK: }
 ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %80.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5)
 ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
 ; CHECK: internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
 ; CHECK: }
 ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5)
 ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
 ; CHECK: internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
 ; CHECK: }
@@ -311,28 +311,28 @@
 ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5)
 ; CHECK: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 {
 ; CHECK: internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2
 ; CHECK: }
 ; CHECK: %61.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %61.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5)
 ; CHECK: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 {
 ; CHECK: internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2
 ; CHECK: }
 ; CHECK: %56.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %56.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5)
 ; CHECK: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 {
 ; CHECK: internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2
 ; CHECK: }
 ; CHECK: %51.sub1:vreg_128 = COPY %43.sub1
 ; CHECK: %51.sub3:vreg_128 = COPY %43.sub1
 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1)
- ; CHECK: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5)
 ; CHECK: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 {
 ; CHECK: internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2
 ; CHECK: }
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -10,7 +10,7 @@
 ; CHECK: bb.0..expVert:
 ; CHECK: liveins: $sgpr3, $sgpr4, $sgpr5, $sgpr8, $sgpr9, $sgpr10, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr25, $sgpr27, $sgpr31
 ; CHECK: undef %56.sub0:sgpr_64 = COPY $sgpr31
- ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
+ ; CHECK: SI_SPILL_S32_SAVE $sgpr27, %stack.2, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5)
 ; CHECK: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr25
 ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr5
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr4
@@ -32,7 +32,7 @@
 ; CHECK: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc
 ; CHECK: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc
 ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc
- ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5)
+ ; CHECK: SI_SPILL_S32_SAVE [[S_AND_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5)
 ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
 ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], implicit-def $scc
 ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
@@ -46,7 +46,7 @@
 ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
 ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4)
- ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5)
 ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0
 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc
 ; CHECK: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
@@ -217,7 +217,7 @@
 ; CHECK: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -507, implicit-def dead $scc
 ; CHECK: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -539, implicit-def dead $scc
 ; CHECK: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
- ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5)
+ ; CHECK: [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5)
 ; CHECK: [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[SI_SPILL_S32_RESTORE]], 96, implicit-def $scc
 ; CHECK: [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc
 ; CHECK: undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc
@@ -336,8 +336,8 @@
 ; CHECK: [[V_OR_B32_e32_61:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_60]], [[V_ADD_U32_e32_25]], implicit $exec
 ; CHECK: [[V_ADD_U32_e32_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -575, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec
 ; CHECK: [[V_OR_B32_e32_62:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[V_OR_B32_e32_61]], [[V_ADD_U32_e32_26]], implicit $exec
- ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5)
- ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr100_sgpr101_sgpr102_sgpr103, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5)
+ ; CHECK: [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5)
+ ; CHECK: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5)
 ; CHECK: undef %914.sub2_sub3:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub2_sub3 {
 ; CHECK: internal %914.sub0:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0
 ; CHECK: }
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
--- a/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-nolivesubranges.mir
@@ -24,9 +24,9 @@
 ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
 ; CHECK: renamable $sgpr0 = IMPLICIT_DEF
 ; CHECK: renamable $sgpr1 = IMPLICIT_DEF
- ; CHECK: SI_SPILL_S64_SAVE renamable $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $private_rsrc_reg, implicit $sp_reg :: (store 8 into %stack.0, align 4, addrspace 5)
+ ; CHECK: SI_SPILL_S64_SAVE renamable $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store 8 into %stack.0, align 4, addrspace 5)
 ; CHECK: KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
- ; CHECK: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $private_rsrc_reg, implicit $sp_reg :: (load 8 from %stack.0, align 4, addrspace 5)
+ ; CHECK: renamable $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load 8 from %stack.0, align 4, addrspace 5)
 ; CHECK: $sgpr105 = S_AND_B32 renamable $sgpr1, renamable $sgpr1, implicit-def $scc
 ; CHECK: S_NOP 0, implicit $sgpr104, implicit $sgpr105
 %0:sreg_64 = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
--- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir
@@ -9,11 +9,11 @@
 # CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
 # CHECK-NEXT: stack-id: sgpr-spill,
-# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
+# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
+# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
-# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5)
+# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5)
+# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5)
 name: no_merge_sgpr_vgpr_spill_slot
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
--- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir
@@ -110,7 +110,7 @@
 ; and inserting a
spill. Here we just check that the point where the error ; occurs we see a correctly generated spill. ; GCN-LABEL: bb.7: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -126,7 +126,7 @@ successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -137,7 +137,7 @@ successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.0, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir b/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir --- a/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir +++ b/llvm/test/CodeGen/AMDGPU/unexpected-reg-unit-state.mir @@ -16,9 +16,9 @@ ; CHECK: liveins: $vgpr0 ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec ; CHECK: renamable $sgpr4_sgpr5 = COPY $vcc - ; CHECK: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) + ; CHECK: SI_SPILL_S64_SAVE $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5) ; CHECK: renamable $sgpr4_sgpr5 = COPY $vcc - ; CHECK: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5) + ; CHECK: $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5) ; CHECK: renamable $vgpr0 = V_CNDMASK_B32_e64 0, -1, 0, 3, killed $sgpr4_sgpr5, implicit $exec ; CHECK: S_ENDPGM 0, implicit killed $vgpr0, implicit killed renamable $vcc %0:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir @@ -18,7 +18,7 @@ ; CHECK: liveins: $vgpr0 ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; CHECK: S_NOP 0, implicit $vgpr0 - SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) S_NOP 0, implicit $vgpr0 ... @@ -38,7 +38,7 @@ ; CHECK-LABEL: name: spill_v32_kill ; CHECK: liveins: $vgpr0 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ... 
--- @@ -59,7 +59,7 @@ ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) ; CHECK: S_NOP 0, implicit $vgpr0_vgpr1 - SI_SPILL_V64_SAVE $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_V64_SAVE $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) S_NOP 0, implicit $vgpr0_vgpr1 ... @@ -80,7 +80,7 @@ ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... # Make sure there's no verifier error on the undef spill component when the value is killed. @@ -102,7 +102,7 @@ ; CHECK: liveins: $vgpr0 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... --- @@ -122,5 +122,5 @@ ; CHECK: liveins: $vgpr1 ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) + SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ...