diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1435,6 +1435,7 @@ FrameInfo.getObjectAlign(FrameIndex)); unsigned SpillSize = TRI->getSpillSize(*RC); + MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); @@ -1448,7 +1449,6 @@ // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. if (SrcReg.isVirtual() && SpillSize == 4) { - MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } @@ -1467,6 +1467,17 @@ : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); + if (RI.isVectorSuperClass(RC)) { + // Convert an AV spill into a VGPR spill by first copying the value into + // an equivalent VGPR. Regalloc may introduce AV spills that are only + // relevant until the rewriter, where they become VGPR or AGPR spills. + // The copy inherits isKill: SrcReg may outlive a non-killing spill. + Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) + .addReg(SrcReg, getKillRegState(isKill)); + SrcReg = TmpVReg; + } + BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1600,11 +1611,24 @@ unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) : getVGPRSpillRestoreOpcode(SpillSize); + + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); + Register TmpReg = DestReg; + if (IsVectorSuperClass) { + // For AV classes, insert the spill restore to a VGPR followed by a copy + // into an equivalent AV register. 
+ MachineRegisterInfo &MRI = MF->getRegInfo(); + DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); + } BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); + + if (IsVectorSuperClass) + BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) + .addReg(DestReg, RegState::Kill); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -77,6 +77,10 @@ return 100; } + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + Register getFrameRegister(const MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -402,6 +402,62 @@ return CSR_AMDGPU_NoRegs_RegMask; } +const TargetRegisterClass * +SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + // Widen a VGPR or AGPR class to the same-size AV class on MAI subtargets. + // FIXME: Use a helper like getEquivalentVGPRClass to get the AV class. Doing + // so today makes the verifier crash after RegBankSelect in the GISel flow: + // the aligned regclasses are not fully given until instruction selection. 
+ if (MF.getSubtarget<GCNSubtarget>().hasMAIInsts() && + (isVGPRClass(RC) || isAGPRClass(RC))) { + if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass) + return &AMDGPU::AV_32RegClass; + if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass) + return &AMDGPU::AV_64RegClass; + if (RC == &AMDGPU::VReg_64_Align2RegClass || + RC == &AMDGPU::AReg_64_Align2RegClass) + return &AMDGPU::AV_64_Align2RegClass; + if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass) + return &AMDGPU::AV_96RegClass; + if (RC == &AMDGPU::VReg_96_Align2RegClass || + RC == &AMDGPU::AReg_96_Align2RegClass) + return &AMDGPU::AV_96_Align2RegClass; + if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass) + return &AMDGPU::AV_128RegClass; + if (RC == &AMDGPU::VReg_128_Align2RegClass || + RC == &AMDGPU::AReg_128_Align2RegClass) + return &AMDGPU::AV_128_Align2RegClass; + if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass) + return &AMDGPU::AV_160RegClass; + if (RC == &AMDGPU::VReg_160_Align2RegClass || + RC == &AMDGPU::AReg_160_Align2RegClass) + return &AMDGPU::AV_160_Align2RegClass; + if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass) + return &AMDGPU::AV_192RegClass; + if (RC == &AMDGPU::VReg_192_Align2RegClass || + RC == &AMDGPU::AReg_192_Align2RegClass) + return &AMDGPU::AV_192_Align2RegClass; + if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass) + return &AMDGPU::AV_256RegClass; + if (RC == &AMDGPU::VReg_256_Align2RegClass || + RC == &AMDGPU::AReg_256_Align2RegClass) + return &AMDGPU::AV_256_Align2RegClass; + if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass) + return &AMDGPU::AV_512RegClass; + if (RC == &AMDGPU::VReg_512_Align2RegClass || + RC == &AMDGPU::AReg_512_Align2RegClass) + return &AMDGPU::AV_512_Align2RegClass; + if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass) + return &AMDGPU::AV_1024RegClass; + if (RC == 
&AMDGPU::VReg_1024_Align2RegClass || + RC == &AMDGPU::AReg_1024_Align2RegClass) + return &AMDGPU::AV_1024_Align2RegClass; + } + + return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF); +} + Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const SIFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); @@ -994,10 +1050,22 @@ unsigned Dst = IsStore ? Reg : ValueReg; unsigned Src = IsStore ? ValueReg : Reg; - unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 - : AMDGPU::V_ACCVGPR_READ_B32_e64; + bool IsVGPR = TRI->isVGPR(MRI, Reg); + DebugLoc DL = MI->getDebugLoc(); + if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) { + // Spiller during regalloc may restore a spilled register to its superclass. + // It could result in AGPR spills restored to VGPRs or the other way around, + // making the src and dst with identical regclasses at this point. It just + // needs a copy in such cases. + auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); + CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); + return CopyMIB; + } + unsigned Opc = (IsStore ^ IsVGPR) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 + : AMDGPU::V_ACCVGPR_READ_B32_e64; - auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst) + auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst) .addReg(Src, getKillRegState(IsKill)); MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse); return MIB; diff --git a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir --- a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir +++ b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir @@ -20,7 +20,7 @@ ; CHECK: successors: %bb.1(0x80000000) ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF - ; CHECK: SI_SPILL_V1024_SAVE [[DEF1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; CHECK: [[COPY:%[0-9]+]]:av_1024_align2 = COPY [[DEF1]] ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK: S_NOP 0, implicit [[DEF1]] @@ -29,18 +29,17 @@ ; CHECK: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: [[SI_SPILL_V1024_RESTORE:%[0-9]+]]:vreg_1024_align2 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) - ; CHECK: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { - ; CHECK: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 + ; CHECK: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY 
[[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { + ; CHECK: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 ; CHECK: } - ; CHECK: %5.sub0:vreg_1024_align2 = IMPLICIT_DEF + ; CHECK: %5.sub0:av_1024_align2 = IMPLICIT_DEF ; CHECK: S_NOP 0, implicit %5.sub0 ; CHECK: bb.3: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: S_NOP 0, implicit %5 ; CHECK: bb.4: ; CHECK: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; CHECK: [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF ; CHECK: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc ; CHECK: bb.5: ; CHECK: undef %3.sub0:vreg_1024_align2 = COPY [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -0,0 +1,62 @@ +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX908 %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 -verify-machineinstrs < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s +;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog -verify-machineinstrs < %s | FileCheck -check-prefix=PEI-GFX90A %s + +; Partial reg copy and spill missed during regalloc handled later at frame lowering. 
+define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { + ; REGALLOC-GFX908-LABEL: name: partial_copy + ; REGALLOC-GFX908: bb.0 (%ir-block.0): + ; REGALLOC-GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def [[VREG_64:%[0-9]+]] + ; REGALLOC-GFX908: SI_SPILL_V64_SAVE [[VREG_64]], %stack.0 + ; REGALLOC-GFX908: [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 + ; REGALLOC-GFX908: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0 + ; REGALLOC-GFX908: GLOBAL_STORE_DWORDX2 undef %{{[0-9]+}}:vreg_64, [[SI_SPILL_V64_RESTORE]] + ; REGALLOC-GFX908: [[COPY_A128_TO_V128:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]] + ; REGALLOC-GFX908: GLOBAL_STORE_DWORDX4 undef %{{[0-9]+}}:vreg_64, [[COPY_A128_TO_V128]] + ; + ; PEI-GFX908-LABEL: name: partial_copy + ; PEI-GFX908: bb.0 (%ir-block.0): + ; PEI-GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2949130 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX908: BUFFER_STORE_DWORD_OFFSET killed $vgpr0 + ; PEI-GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1 + ; PEI-GFX908: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 + ; PEI-GFX908: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET + ; PEI-GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4 + ; PEI-GFX908: GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1 + ; PEI-GFX908: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; PEI-GFX908: GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; + ; REGALLOC-GFX90A-LABEL: name: partial_copy + ; REGALLOC-GFX90A: bb.0 (%ir-block.0): + ; REGALLOC-GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def [[VREG_64:%[0-9]+]] + ; REGALLOC-GFX90A: SI_SPILL_V64_SAVE [[VREG_64]], %stack.0 + ; REGALLOC-GFX90A: [[V_MFMA_I32_4X4X4I8_A128:%[0-9]+]]:areg_128_align2 = 
V_MFMA_I32_4X4X4I8_e64 + ; REGALLOC-GFX90A: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64_align2 = SI_SPILL_V64_RESTORE %stack.0 + ; REGALLOC-GFX90A: [[COPY_AV64:%[0-9]+]]:av_64_align2 = COPY [[SI_SPILL_V64_RESTORE]] + ; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX2 undef %15:vreg_64_align2, [[COPY_AV64]] + ; REGALLOC-GFX90A-NOT: %{{[0-9]+}}:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_A128]] + ; REGALLOC-GFX90A: GLOBAL_STORE_DWORDX4 undef %17:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_A128]] + ; + ; PEI-GFX90A-LABEL: name: partial_copy + ; PEI-GFX90A: bb.0 (%ir-block.0): + ; PEI-GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 + ; PEI-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0 + ; PEI-GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1 + ; PEI-GFX90A: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 + ; PEI-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET + ; PEI-GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4 + ; PEI-GFX90A: GLOBAL_STORE_DWORDX2 undef renamable ${{.*}}, killed renamable $vgpr0_vgpr1 + ; PEI-GFX90A: GLOBAL_STORE_DWORDX4 undef renamable ${{.*}}, killed renamable $agpr0_agpr1_agpr2_agpr3 + %v0 = call <4 x i32> asm sideeffect "; def $0", "=v" () + %v1 = call <2 x i32> asm sideeffect "; def $0", "=v" () + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) + store volatile <4 x i32> %v0, <4 x i32> addrspace(1)* undef + store volatile <2 x i32> %v1, <2 x i32> addrspace(1)* undef + store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef + ret void +} + +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) + +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,23 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908-A2M,A2M %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX90A-A2M,A2M %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; GCN-LABEL: {{^}}max_24regs_32a_used: -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; GCN-DAG: v_mfma_f32_16x16x1f32 -; A2V-NOT: SCRATCH_RSRC -; GFX908-A2M-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse -; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse -; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse -; GFX90A-NOT: v_accvgpr_write_b32 -; A2V: ScratchSize: 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-DAG: v_mfma_f32_16x16x1f32 +; GCN-DAG: v_mfma_f32_16x16x1f32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-NOT: buffer_store_dword +; GCN-NOT: buffer_load_dword +; GFX908-NOT: v_accvgpr_write_b32 +; GFX90A: 
v_accvgpr_write_b32 +; GCN: ScratchSize: 0 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -39,16 +33,13 @@ } ; GCN-LABEL: {{^}}max_12regs_13a_used: -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse -; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse -; A2V: ScratchSize: 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GCN-NOT: buffer_store_dword +; GCN-NOT: buffer_load_dword +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GCN: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -70,17 +61,13 @@ } ; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; A2V-NOT: SCRATCH_RSRC - -; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse -; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse -; A2V: ScratchSize: 0 - -; GFX908-A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 
offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX908-A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908-A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} +; GCN-NOT: buffer_store_dword +; GCN-NOT: buffer_load_dword +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] +; GCN: ScratchSize: 0 define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { %a1 = call <4 x i32> asm sideeffect "", "=a"() %a2 = call <4 x i32> asm sideeffect "", "=a"() @@ -92,17 +79,14 @@ } ; GCN-LABEL: {{^}}max_32regs_mfma32: -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse -; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; GFX90A-NOT: v_accvgpr_read_b32 -; GFX90A: v_mfma_f32_32x32x1f32 -; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse -; GFX90A-NOT: v_accvgpr_write_b32 -; A2V: ScratchSize: 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-NOT: buffer_store_dword +; GCN: v_accvgpr_read_b32 +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: buffer_load_dword +; GCN: v_accvgpr_write_b32 +; GCN: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: %v = call i32 asm sideeffect "", "=a"() @@ -116,6 +100,47 @@ ret void } +; Should spill agprs to memory for both gfx908 and gfx90a. 
+; GCN-LABEL: {{^}}max_5regs_used_8a: +; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 + +; GFX908-DAG: v_accvgpr_read_b32 v1, a0 ; Reload Reuse +; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX908-DAG: v_accvgpr_read_b32 v1, a1 ; Reload Reuse +; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX908-DAG: v_accvgpr_read_b32 v1, a2 ; Reload Reuse +; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill +; GFX908-DAG: v_accvgpr_read_b32 v1, a3 ; Reload Reuse +; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill + +; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill +; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill + +; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] + +; GCN-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload + +; GCN: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off +; GCN: ScratchSize: 20 +define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v0 = call float asm sideeffect "; def $0", "=v"() + %a4 = call <4 x float> asm sideeffect "; def $0", "=a"() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid + %mai.in = load <4 x float>, <4 x float> 
addrspace(1)* %gep + %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0) + store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep + store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef + call void asm sideeffect "; use $0", "v"(float %v0); + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) @@ -125,3 +150,4 @@ attributes #1 = { nounwind "amdgpu-num-vgpr"="10" } attributes #2 = { nounwind "amdgpu-num-vgpr"="12" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" } +attributes #4 = { nounwind "amdgpu-num-vgpr"="5" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +; Convert AV spills into VGPR spills by introducing appropriate copies in between. 
+ +define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { + ; GCN-LABEL: name: test_spill_av_class + ; GCN: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0 + ; GCN-NEXT: undef %23.sub0:av_64 = COPY %21.sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY %23 + ; GCN-NEXT: SI_SPILL_V64_SAVE [[COPY1]], %stack.0, $sgpr32, 0, implicit $exec + ; GCN: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:av_64 = COPY [[SI_SPILL_V64_RESTORE]] + ; GCN-NEXT: undef %22.sub0:vreg_64 = COPY [[COPY3]].sub0 + %v0 = call i32 asm sideeffect "; def $0", "=v"() + %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0) + store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef + call void asm sideeffect "; use $0", "v"(<2 x i32> %tmp); + ret void +} + +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) + +attributes #0 = { nounwind "amdgpu-num-vgpr"="5" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -5,15 +5,14 @@ ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse +; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_store_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX900: buffer_load_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 
v{{[0-9]}}, a1 ; Reload Reuse +; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]] +; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]] ; GCN: NumVgprs: 10 ; GFX900: ScratchSize: 12 @@ -57,19 +56,19 @@ } ; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse -; GFX908: buffer_store_dword v{{[0-9]}}, +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}} +; GFX908-NOT: buffer_store_dword v{{[0-9]}}, ; GFX908-NOT: buffer_ -; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse -; GFX908: buffer_load_dword v{{[0-9]}}, +; GFX908: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]] +; GFX908: v_accvgpr_read_b32 [[V_REG]], [[A_REG]] ; GFX908-NOT: buffer_ ; GFX900: couldn't allocate input reg for constraint 'a' ; GFX908: NumVgprs: 10 -; GFX908: ScratchSize: 8 +; GFX908: ScratchSize: 0 ; GFX908: VGPRBlocks: 2 ; GFX908: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { @@ -113,28 +112,28 @@ ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1 -; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse -; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse -; GFX900: 
buffer_store_dword v{{[0-9]}}, ; GCN-DAG: buffer_store_dword v{{[0-9]}}, -; GFX900: buffer_load_dword v{{[0-9]}}, +; GCN-DAG: buffer_store_dword v{{[0-9]}}, +; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GCN-DAG: buffer_load_dword v{{[0-9]}}, ; GCN-DAG: buffer_load_dword v{{[0-9]}}, -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse -; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 +; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; GCN: NumVgprs: 10 ; GFX900: ScratchSize: 44 @@ -166,10 +165,10 @@ ; GCN-LABEL: {{^}}max_10_vgprs_spill_v32: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse -; 
GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse -; GCN-NOT: a10 ; GCN: buffer_store_dword v{{[0-9]}}, +; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} +; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} +; GCN-NOT: a10 ; GFX908: NumVgprs: 10 ; GFX900: ScratchSize: 100 @@ -231,23 +230,20 @@ ret void } -; FIXME: adding an AReg_1024 register class for v32f32 and v32i32 -; produces unnecessary copies and we still have some amount -; of conventional spilling. - ; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb: ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GFX908-FIXME-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_write_b32 a0, v +; GFX908-NOT: SCRATCH_RSRC +; GFX908: v_accvgpr_write_b32 +; GFX908: global_load_ ; GFX900: buffer_store_dword v ; GFX900: buffer_load_dword v -; GFX908-FIXME-NOT: buffer_ +; GFX908-NOT: buffer_ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 ; GFX900: ScratchSize: 2052 -; GFX908-FIXME: ScratchSize: 0 +; GFX908: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vector-spill-restore-to-other-vector-type.mir @@ -0,0 +1,224 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-flat-scratch -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +# A spilled register can be restored to its superclass during regalloc. +# As a result, we might see AGPR spills restored to VGPRs or the other way around. 
+ +--- +name: partial_spill_a128_restore_to_v128_1_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr52, $vgpr53, $vgpr54, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + + ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_1_of_4 + ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN: {{ $}} + ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GCN: $vgpr51 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + ; GCN: $vgpr48_vgpr49_vgpr50 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr54, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 +... + +--- +name: partial_spill_a128_restore_to_v128_2_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr52, $vgpr53, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + + ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_2_of_4 + ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN: {{ $}} + ; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN: $vgpr51 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + ; GCN: $vgpr50 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + ; GCN: $vgpr48_vgpr49 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $vgpr52, implicit $vgpr53, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 +... + +--- +name: partial_spill_a128_restore_to_v128_3_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr52, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + + ; GCN-LABEL: name: partial_spill_a128_restore_to_v128_3_of_4 + ; GCN: liveins: $vgpr52, $vgpr53, $vgpr54, $vgpr55, $agpr0_agpr1_agpr2_agpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN: {{ $}} + ; GCN: $vgpr53 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr54 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr55 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN: $vgpr51 = COPY $vgpr53, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + ; GCN: $vgpr50 = COPY $vgpr54, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + ; GCN: $vgpr49 = COPY $vgpr55, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 + 
; GCN: $vgpr48 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51 :: (load (s32) from %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $vgpr48_vgpr49_vgpr50_vgpr51 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $vgpr52, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 +... 
+ +--- +name: full_spill_a128_restore_to_v128 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $agpr0_agpr1_agpr2_agpr3 + + ; GCN-LABEL: name: full_spill_a128_restore_to_v128 + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; GCN: {{ $}} + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GCN: $vgpr55 = COPY $vgpr0, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55 + ; GCN: $vgpr54 = COPY $vgpr1, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55 + ; GCN: $vgpr53 = COPY $vgpr2, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55 + ; GCN: $vgpr52 = COPY $vgpr3, implicit-def $vgpr52_vgpr53_vgpr54_vgpr55 + ; GCN: S_ENDPGM 0 + SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $vgpr52_vgpr53_vgpr54_vgpr55 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0 +... 
+ +--- +name: partial_spill_v128_restore_to_a128_1_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $agpr31, $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25 + + ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_1_of_4 + ; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: {{ $}} + ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: $agpr26_agpr27_agpr28 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25 + SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $agpr31, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25 +... + +--- +name: partial_spill_v128_restore_to_a128_2_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24_agpr25 + + ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_2_of_4 + ; GCN: liveins: $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr24_agpr25, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: {{ $}} + ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN: $agpr29 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: $agpr28 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: $agpr26_agpr27 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit 
$agpr24_agpr25 + SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24_agpr25 +... + +--- +name: partial_spill_v128_restore_to_a128_3_of_4 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr24 + + ; GCN-LABEL: name: partial_spill_v128_restore_to_a128_3_of_4 + ; GCN: liveins: $agpr24, $agpr25, $agpr30, $agpr31, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: {{ $}} + ; GCN: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN: $agpr29 = COPY $agpr25, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: $agpr28 = COPY $agpr30, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: 
$agpr27 = COPY $agpr31, implicit-def $agpr26_agpr27_agpr28_agpr29 + ; GCN: $agpr26 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr26_agpr27_agpr28_agpr29 :: (load (s32) from %stack.0, addrspace 5) + ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24 + SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $agpr26_agpr27_agpr28_agpr29 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23, implicit $agpr24 +... + +--- +name: full_spill_v128_restore_to_a128 +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 16, alignment: 4 } +machineFunctionInfo: + hasSpilledVGPRs: true + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; GCN-LABEL: name: full_spill_v128_restore_to_a128 + ; GCN: liveins: $agpr4, $agpr5, $agpr6, $agpr7, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: {{ $}} + ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; GCN: $agpr3 = COPY $agpr4, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr2 = COPY $agpr5, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr1 
= COPY $agpr6, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GCN: $agpr0 = COPY $agpr7, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GCN: S_ENDPGM 0 + SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5) + $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0 +...