diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1269,6 +1269,8 @@
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
+    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
+           SrcReg != AMDGPU::EXEC && "exec should not be spilled");
 
     // We are only allowed to create one new instruction when spilling
     // registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1278,7 +1280,7 @@
     // to make sure we are using the correct register class.
     if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
       MachineRegisterInfo &MRI = MF->getRegInfo();
-      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }
 
     BuildMI(MBB, MI, DL, OpDesc)
@@ -1401,13 +1403,15 @@
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
     assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
+    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
+           DestReg != AMDGPU::EXEC && "exec should not be reloaded into");
 
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
     if (DestReg.isVirtual() && SpillSize == 4) {
       MachineRegisterInfo &MRI = MF->getRegInfo();
-      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
     }
 
     if (RI.spillSGPRToVGPR())
@@ -7021,20 +7025,24 @@
   // %0 may even spill. We can't spill $m0 normally (it would require copying to
   // a numbered SGPR anyway), and since it is in the SReg_32 register class,
   // TargetInstrInfo::foldMemoryOperand() is going to try.
+  // A similar issue also exists with spilling and reloading $exec registers.
   //
   // To prevent that, constrain the %0 register class here.
   if (MI.isFullCopy()) {
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
-
-    if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
-      MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
-      return nullptr;
-    }
-
-    if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
-      MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
-      return nullptr;
+    if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
+        (DstReg.isVirtual() != SrcReg.isVirtual())) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
+      const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
+      if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
+        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
+        return nullptr;
+      } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
+        MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
+        return nullptr;
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -887,68 +887,41 @@
   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
-  unsigned FirstPart = isWave32 ? Offset * 16 : Offset * 32;
+  unsigned FirstPart = Offset * 32;
+  unsigned ExecLane = 0;
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  const bool SuperRegIsExec =
-      SuperReg == AMDGPU::EXEC || SuperReg == AMDGPU::EXEC_LO;
-
-  // If exec mask is stored in the VGPR, make sure it is stored after
-  // any lanes used by the spill (16 lanes on Wave32, 32 lanes on Wave64).
-  const unsigned ExecLoLane = SuperRegIsExec ? 0 : (isWave32 ? 16 : 32);
-  const unsigned ExecHiLane = SuperRegIsExec ? 1 : (isWave32 ? 17 : 33);
-
-  // Try to use the src/dst SGPRs to hold a copy of the exec mask.
-  // Use VGPR lanes when this is not possible, i.e. the src value
-  // must be valid after the spill or src is smaller than exec mask.
-  bool StoreExecInVGPR = !IsLoad && (SuperRegIsExec || !IsKill);
+  // Cannot handle load/store to EXEC
+  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+         SuperReg != AMDGPU::EXEC && "exec should never spill");
 
   // On Wave32 only handle EXEC_LO.
   // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
-  bool OnlyExecLo = isWave32 || NumSubRegs == 1;
+  bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
 
   unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   Register SavedExecReg;
 
   // Backup EXEC
-  if (SuperRegIsExec) {
-    // Do nothing; exec is already stored in VGPR or will be overwritten
-  } else if (StoreExecInVGPR) {
-    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
-            VGPR)
-        .addReg(AMDGPU::EXEC_LO)
-        .addImm(ExecLoLane)
-        .addReg(VGPR, getUndefRegState(IsLoad));
-
-    if (!isWave32) {
-      BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
-              VGPR)
-          .addReg(AMDGPU::EXEC_HI)
-          .addImm(ExecHiLane)
-          .addReg(VGPR);
-    }
+  if (OnlyExecLo) {
+    SavedExecReg = NumSubRegs == 1
+                       ? SuperReg
+                       : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]);
   } else {
-    if (OnlyExecLo) {
-      SavedExecReg = NumSubRegs == 1
-                         ? SuperReg
-                         : getSubReg(SuperReg, SplitParts[FirstPart]);
-    } else {
-      SavedExecReg =
-          getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart]),
-                              AMDGPU::sub0, &AMDGPU::SReg_64_XEXECRegClass);
-      // If src/dst is an odd size it is possible subreg0 is not aligned.
-      if (!SavedExecReg && NumSubRegs > 2)
-        SavedExecReg =
-            getMatchingSuperReg(getSubReg(SuperReg, SplitParts[FirstPart + 1]),
-                                AMDGPU::sub0, &AMDGPU::SReg_64_XEXECRegClass);
+    // If src/dst is an odd size it is possible subreg0 is not aligned.
+    for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
+      SavedExecReg = getMatchingSuperReg(
+          getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
+          &AMDGPU::SReg_64_XEXECRegClass);
+      if (SavedExecReg)
+        break;
     }
-
-    assert(SavedExecReg);
-    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
   }
+  assert(SavedExecReg);
+  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
 
   // Setup EXEC
   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
@@ -976,34 +949,34 @@
                         Offset * EltSize, MMO, RS);
   } else {
-    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
-                        Index,
-                        VGPR, !StoreExecInVGPR,
-                        MFI->getScratchRSrcReg(), FrameReg,
-                        Offset * EltSize, MMO,
-                        RS);
+    buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR,
+                        IsKill, MFI->getScratchRSrcReg(), FrameReg,
+                        Offset * EltSize, MMO, RS);
     // This only ever adds one VGPR spill
     MFI->addToSpilledVGPRs(1);
   }
 
   // Restore EXEC
-  if (SuperRegIsExec && IsLoad) {
-    // Do nothing; exec will be overwritten
-  } else if (StoreExecInVGPR) {
-    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
-            AMDGPU::EXEC_LO)
-        .addReg(VGPR, getKillRegState(!IsLoad && isWave32))
-        .addImm(ExecLoLane);
-    if (!isWave32) {
+  BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
+      .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
+
+  // Restore clobbered SGPRs
+  if (IsLoad) {
+    // Nothing to do; register will be overwritten
+  } else if (!IsKill) {
+    // Restore SGPRs from appropriate VGPR lanes
+    if (!OnlyExecLo) {
       BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
-              AMDGPU::EXEC_HI)
-          .addReg(VGPR, getKillRegState(!IsLoad))
-          .addImm(ExecHiLane);
+              getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
+          .addReg(VGPR)
+          .addImm(ExecLane + 1);
     }
-  } else {
-    assert(SavedExecReg);
-    BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
-        .addReg(SavedExecReg, RegState::Kill);
+    BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+            NumSubRegs == 1
+                ? SavedExecReg
+                : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]))
+        .addReg(VGPR, RegState::Kill)
+        .addImm(ExecLane);
   }
 }
@@ -1032,6 +1005,8 @@
                     SuperReg != MFI->getFrameOffsetReg()));
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+         SuperReg != AMDGPU::EXEC && "exec should never spill");
 
   unsigned EltSize = 4;
   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
@@ -1069,11 +1044,12 @@
   // Scavenged temporary VGPR to use. It must be scavenged once for any number
   // of spilled subregs.
   Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+  RS->setRegUsed(TmpVGPR);
 
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
 
-  unsigned PerVGPR = isWave32 ? 16 : 32;
+  unsigned PerVGPR = 32;
   unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
   int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
 
@@ -1138,6 +1114,8 @@
   Register SuperReg = MI->getOperand(0).getReg();
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+  assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+         SuperReg != AMDGPU::EXEC && "exec should never spill");
 
   unsigned EltSize = 4;
 
@@ -1157,14 +1135,14 @@
               SubReg)
           .addReg(Spill.VGPR)
          .addImm(Spill.Lane);
-
       if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
     }
   } else {
     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+    RS->setRegUsed(TmpVGPR);
 
-    unsigned PerVGPR = isWave32 ? 16 : 32;
+    unsigned PerVGPR = 32;
     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
 
@@ -1186,7 +1164,6 @@
               TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
           .addReg(TmpVGPR, getKillRegState(LastSubReg))
          .addImm(i);
-
       if (NumSubRegs > 1 && i == 0)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
     }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
@@ -0,0 +1,152 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -stress-regalloc=2 -start-before=greedy -stop-after=virtregmap -o - %s | FileCheck %s
+
+# Test that a spill of a copy of exec is not folded into a direct spill of exec.
+
+---
+
+name: merge_sgpr_spill_into_copy_from_exec_lo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def $exec_lo
+    ; CHECK: $sgpr0 = S_MOV_B32 $exec_lo
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $exec_lo = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def $exec_lo
+    %0:sreg_32 = COPY $exec_lo
+    S_NOP 0, implicit-def %1:sreg_32, implicit-def %2:sreg_32, implicit %0
+    $exec_lo = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
+---
+
+name: merge_sgpr_spill_into_copy_from_exec_hi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def $exec_hi
+    ; CHECK: $sgpr0 = S_MOV_B32 $exec_hi
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $exec_hi = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def $exec_hi
+    %0:sreg_32 = COPY $exec_hi
+    S_NOP 0, implicit-def %1:sreg_32, implicit-def %2:sreg_32, implicit %0
+    $exec_hi = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
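+
+# The same fold must be avoided for the full 64-bit exec mask; as the checks
+# below show, that spill uses a V_WRITELANE/V_READLANE pair per 32-bit half.
+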
+---
+
+name: merge_sgpr_spill_into_copy_from_exec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def $exec
+    ; CHECK: $sgpr0_sgpr1 = S_MOV_B64 $exec
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr1, 1, killed $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1
+    ; CHECK: $sgpr1 = V_READLANE_B32_vi $vgpr0, 1
+    ; CHECK: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1
+    ; CHECK: $sgpr1 = V_READLANE_B32_vi killed $vgpr0, 1
+    ; CHECK: $exec = S_MOV_B64 killed $sgpr0_sgpr1
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def $exec
+    %0:sreg_64 = COPY $exec
+    S_NOP 0, implicit-def %1:sreg_64, implicit-def %2:sreg_64, implicit %0
+    $exec = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
+
+# Test that a reload into a copy of exec is not folded into a direct reload of exec.
+
+---
+
+name: reload_sgpr_spill_into_copy_to_exec_lo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $exec_lo = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo
+    S_NOP 0, implicit %0, implicit-def %3:sreg_32, implicit-def %4:sreg_32
+    $exec_lo = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
+---
+
+name: reload_sgpr_spill_into_copy_to_exec_hi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $exec_hi = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi
+    S_NOP 0, implicit %0, implicit-def %3:sreg_32, implicit-def %4:sreg_32
+    $exec_hi = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
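+
+# Reloading into a copy of the full 64-bit exec mask follows the same pattern,
+# again using one VGPR lane per 32-bit half.
+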
+---
+
+name: reload_sgpr_spill_into_copy_to_exec
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr1, 1, killed $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1
+    ; CHECK: $sgpr1 = V_READLANE_B32_vi $vgpr0, 1
+    ; CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0, implicit-def $sgpr0_sgpr1
+    ; CHECK: $sgpr1 = V_READLANE_B32_vi killed $vgpr0, 1
+    ; CHECK: $exec = S_MOV_B64 killed $sgpr0_sgpr1
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec
+    S_NOP 0, implicit %0, implicit-def %3:sreg_64, implicit-def %4:sreg_64
+    $exec = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir
@@ -8,15 +8,15 @@
 # CHECK: V_WRITELANE
 # CHECK: $sgpr12 = S_MOV_B32 $exec_lo
 # CHECK: $exec_lo = S_MOV_B32 1
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
+# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12
 
 # S32 without kill
 # CHECK: V_WRITELANE
-# CHECK: V_WRITELANE
+# CHECK: $sgpr12 = S_MOV_B32 $exec_lo
 # CHECK: $exec_lo = S_MOV_B32 1
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
-# CHECK: $exec_lo = V_READLANE
+# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4
+# CHECK: $sgpr12 = V_READLANE
 
 # S64 with kill
 # CHECK: V_WRITELANE
@@ -25,20 +25,22 @@
 # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
 # GCN32: $exec_lo = S_MOV_B32 3
 # GCN64: $exec = S_MOV_B64 3
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
+# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
 
 # S64 without kill
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
-# CHECK: V_WRITELANE
-# GCN64: V_WRITELANE
+# GCN32: $sgpr12 = S_MOV_B32 $exec_lo
+# GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
 # GCN32: $exec_lo = S_MOV_B32 3
 # GCN64: $exec = S_MOV_B64 3
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
-# CHECK: $exec_lo = V_READLANE
-# GCN64: $exec_hi = V_READLANE
+# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8
+# GCN32: $exec_lo = S_MOV_B32 $sgpr12
+# GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13
+# GCN64: $sgpr13 = V_READLANE
+# CHECK: $sgpr12 = V_READLANE
 
 # S96
 # CHECK: V_WRITELANE
@@ -48,7 +50,7 @@
 # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
 # GCN32: $exec_lo = S_MOV_B32 7
 # GCN64: $exec = S_MOV_B64 7
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
+# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16
 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
 
@@ -61,7 +63,7 @@
 # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec
 # GCN32: $exec_lo = S_MOV_B32 15
 # GCN64: $exec = S_MOV_B64 15
-# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
+# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28
 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12
 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13
 
@@ -138,10 +140,6 @@
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
-# GCN32: $sgpr64 = S_MOV_B32 $exec_lo
-# GCN32: $exec_lo = S_MOV_B32 65535
-# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
-# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
@@ -158,13 +156,12 @@
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
 # CHECK: V_WRITELANE
-# GCN32: $sgpr80 = S_MOV_B32 $exec_lo
+# GCN32: $sgpr64 = S_MOV_B32 $exec_lo
 # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
-# GCN32: $exec_lo = S_MOV_B32 65535
+# GCN32: $exec_lo = S_MOV_B32 4294967295
 # GCN64: $exec = S_MOV_B64 4294967295
-# GCN32: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 164
-# GCN64: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
-# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80
+# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
+# GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65
 
 --- |
@@ -350,7 +347,7 @@
 # S1024
 # GCN32: $sgpr64 = S_MOV_B32 $exec_lo
 # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec
-# GCN32: $exec_lo = S_MOV_B32 65535
+# GCN32: $exec_lo = S_MOV_B32 4294967295
 # GCN64: $exec = S_MOV_B64 4294967295
 # CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160
 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64
@@ -371,10 +368,6 @@
 # CHECK: $sgpr77 = V_READLANE
 # CHECK: $sgpr78 = V_READLANE
 # CHECK: $sgpr79 = V_READLANE
-# GCN32: $sgpr80 = S_MOV_B32 $exec_lo
-# GCN32: $exec_lo = S_MOV_B32 65535
-# GCN32: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 164
-# GCN32: $exec_lo = S_MOV_B32 killed $sgpr80
 # CHECK: $sgpr80 = V_READLANE
 # CHECK: $sgpr81 = V_READLANE
 # CHECK: $sgpr82 = V_READLANE
diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,10 +1,15 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
 ; GCN-LABEL: {{^}}main:
+
+; Make sure there are no direct spills of EXEC registers before WQM
+; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, exec_lo
+; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, exec_hi
+
 ; GCN: s_wqm
 
 ; Make sure not emitting unused scratch resource descriptor setup
@@ -16,6 +21,13 @@
 
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; GCN-NOT: s_buffer_load_dword m0
+
+; Make sure there are no direct spills/reloads of EXEC registers
+; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, exec_lo
+; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, exec_hi
+; GCN-NOT: v_readlane_b32 exec_lo
+; GCN-NOT: v_readlane_b32 exec_hi
+
 ; GCN: s_endpgm
 
 ; TOVGPR: ScratchSize: 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
--- a/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
@@ -7,10 +7,6 @@
     ret void
   }
 
-  define amdgpu_kernel void @check_exec() #0 {
-    ret void
-  }
-
   attributes #0 = { "frame-pointer"="all" }
 ...
 ---
@@ -53,12 +49,12 @@
   ; GFX9: $vcc = IMPLICIT_DEF
   ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $vcc_lo, 0, undef $vgpr0, implicit $vcc
   ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $vcc_hi, 1, $vgpr0, implicit $vcc
-  ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $exec_lo, 32, $vgpr0
-  ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $exec_hi, 33, $vgpr0
+  ; GFX9: $vcc = S_MOV_B64 $exec
   ; GFX9: $exec = S_MOV_B64 3
   ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-  ; GFX9: $exec_lo = V_READLANE_B32_vi $vgpr0, 32
-  ; GFX9: $exec_hi = V_READLANE_B32_vi killed $vgpr0, 33
+  ; GFX9: $exec = S_MOV_B64 $vcc
+  ; GFX9: $vcc_hi = V_READLANE_B32_vi $vgpr0, 1
+  ; GFX9: $vcc_lo = V_READLANE_B32_vi killed $vgpr0, 0
   ; GFX9: $vcc = IMPLICIT_DEF
   ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $vcc_lo, 0, undef $vgpr0, implicit $vcc
   ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $vcc_hi, 1, $vgpr0, implicit killed $vcc
@@ -83,12 +79,12 @@
   ; GFX10: $vcc = IMPLICIT_DEF
   ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $vcc_lo, 0, undef $vgpr0, implicit $vcc
   ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $vcc_hi, 1, $vgpr0, implicit $vcc
-  ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $exec_lo, 32, $vgpr0
-  ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $exec_hi, 33, $vgpr0
+  ; GFX10: $vcc = S_MOV_B64 $exec
   ; GFX10: $exec = S_MOV_B64 3
   ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-  ; GFX10: $exec_lo = V_READLANE_B32_gfx10 $vgpr0, 32
-  ; GFX10: $exec_hi = V_READLANE_B32_gfx10 killed $vgpr0, 33
+  ; GFX10: $exec = S_MOV_B64 $vcc
+  ; GFX10: $vcc_hi = V_READLANE_B32_gfx10 $vgpr0, 1
+  ; GFX10: $vcc_lo = V_READLANE_B32_gfx10 killed $vgpr0, 0
   ; GFX10: $vcc = IMPLICIT_DEF
   ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $vcc_lo, 0, undef $vgpr0, implicit $vcc
   ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $vcc_hi, 1, $vgpr0, implicit killed $vcc
@@ -110,72 +106,3 @@
 
     $vcc = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
 ...
----
-name: check_exec
-tracksRegLiveness: true
-liveins:
-  - { reg: '$sgpr4_sgpr5' }
-  - { reg: '$sgpr6_sgpr7' }
-  - { reg: '$sgpr8' }
-frameInfo:
-  maxAlignment: 4
-stack:
-  - { id: 0, type: spill-slot, size: 8, alignment: 4 }
-machineFunctionInfo:
-  isEntryFunction: true
-  waveLimiter: true
-  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
-  stackPtrOffsetReg: '$sgpr32'
-  frameOffsetReg: '$sgpr33'
-  argumentInfo:
-    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-    dispatchPtr: { reg: '$sgpr4_sgpr5' }
-    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
-    workGroupIDX: { reg: '$sgpr8' }
-    privateSegmentWaveByteOffset: { reg: '$sgpr9' }
-body: |
-  bb.0:
-    liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7
-
-    ; CHECK-LABEL: name: check_exec
-    ; CHECK: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr9
-
-    ; GFX9: $sgpr33 = S_MOV_B32 0
-    ; GFX9: $sgpr12 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $sgpr13 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $sgpr14 = S_MOV_B32 4294967295, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $sgpr15 = S_MOV_B32 14680064, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-    ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $exec_lo, 0, undef $vgpr0, implicit $exec
-    ; GFX9: $vgpr0 = V_WRITELANE_B32_vi $exec_hi, 1, $vgpr0, implicit $exec
-    ; GFX9: $exec = S_MOV_B64 3
-    ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-    ; GFX9: $exec_lo = V_READLANE_B32_vi $vgpr0, 0
-    ; GFX9: $exec_hi = V_READLANE_B32_vi killed $vgpr0, 1
-    ; GFX9: $exec = S_MOV_B64 3
-    ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-    ; GFX9: $exec_lo = V_READLANE_B32_vi $vgpr0, 0, implicit-def $exec
-    ; GFX9: $exec_hi = V_READLANE_B32_vi killed $vgpr0, 1
-
-    ; GFX10: $sgpr33 = S_MOV_B32 0
-    ; GFX10: $sgpr96 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $sgpr97 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $sgpr98 = S_MOV_B32 4294967295, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $sgpr99 = S_MOV_B32 836853760, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $sgpr96 = S_ADD_U32 $sgpr96, $sgpr9, implicit-def $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $sgpr97 = S_ADDC_U32 $sgpr97, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99
-    ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $exec_lo, 0, undef $vgpr0, implicit $exec
-    ; GFX10: $vgpr0 = V_WRITELANE_B32_gfx10 $exec_hi, 1, $vgpr0, implicit $exec
-    ; GFX10: $exec = S_MOV_B64 3
-    ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
-    ; GFX10: $exec_lo = V_READLANE_B32_gfx10 $vgpr0, 0
-    ; GFX10: $exec_hi = V_READLANE_B32_gfx10 killed $vgpr0, 1
-    ; GFX10: $exec = S_MOV_B64 3
-    ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
-    ; GFX10: $exec_lo = V_READLANE_B32_gfx10 $vgpr0, 0, implicit-def $exec
-    ; GFX10: $exec_hi = V_READLANE_B32_gfx10 killed $vgpr0, 1
-
-    SI_SPILL_S64_SAVE $exec, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
-
-    $exec = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
-...