diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -54,6 +54,11 @@
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const override;
 
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const ArrayRef<CalleeSavedInfo> CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
 private:
   void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
@@ -86,6 +91,12 @@
                                  MachineBasicBlock::iterator MBBI,
                                  const DebugLoc &DL, const Register SGPR,
                                  const Register VGPR, const int Lane) const;
+  /// Create a CFI index describing a spill of a VGPR to VMEM and
+  /// build a MachineInstr around it.
+  void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, unsigned VGPR,
+                                  int64_t Offset) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1587,6 +1587,36 @@
   return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
 }
 
+bool SIFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  for (const CalleeSavedInfo &CS : CSI) {
+    // Insert the spill to the stack frame.
+    unsigned Reg = CS.getReg();
+
+    if (CS.isSpilledToReg()) {
+      BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
+              CS.getDstReg())
+          .addReg(Reg, getKillRegState(true));
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      // If this value was already livein, we probably have a direct use of the
+      // incoming register value, so don't kill at the spill point. This happens
+      // since we pass some special inputs (workgroup IDs) in the callee saved
+      // range.
+      const bool IsLiveIn = MRI.isLiveIn(Reg);
+      TII->storeRegToStackSlotCFI(MBB, MBBI, Reg, !IsLiveIn, CS.getFrameIdx(),
+                                  RC, TRI);
+    }
+  }
+
+  return true;
+}
+
 // The FP for kernels is always known 0, so we never really need to setup an
 // explicit register for it. However, DisableFramePointerElim will force us to
 // use a register for it.
@@ -1698,3 +1728,42 @@
   buildCFI(MBB, MBBI, DL,
            MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
 }
+
+void SIFrameLowering::buildCFIForVGPRToVMEMSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, unsigned VGPR, int64_t Offset) const {
+  MachineFunction &MF = *MBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfVGPR, OSCFIInst);
+
+  encodeDwarfRegisterLocation(DwarfVGPR, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_swap);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst);
+  encodeULEB128(Offset, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_call_frame_entry_reg);
+  encodeULEB128(MCRI.getDwarfRegNum(
+                    ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC, false), + OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_deref_size); + OSBlock << uint8_t(ST.getWavefrontSize() / CHAR_BIT); + OSBlock << uint8_t(dwarf::DW_OP_LLVM_select_bit_piece); + // FIXME: Can this be a function of the VGPR? + const unsigned VGPRLaneBitSize = 32; + encodeULEB128(VGPRLaneBitSize, OSBlock); + encodeULEB128(ST.getWavefrontSize(), OSBlock); + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -212,12 +212,27 @@ MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const; +private: + void storeRegToStackSlotImpl(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register SrcReg, + bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + bool NeedsCFI) const; + +public: void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + void storeRegToStackSlotCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register SrcReg, + bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1287,51 +1287,63 @@ return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); } -static unsigned getSGPRSpillSaveOpcode(unsigned Size) { +static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) { switch (Size) { case 4: - return AMDGPU::SI_SPILL_S32_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE; case 8: - return AMDGPU::SI_SPILL_S64_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE; case 12: - return AMDGPU::SI_SPILL_S96_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE; case 16: - return AMDGPU::SI_SPILL_S128_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE + : AMDGPU::SI_SPILL_S128_SAVE; case 20: - return AMDGPU::SI_SPILL_S160_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE + : AMDGPU::SI_SPILL_S160_SAVE; case 24: - return AMDGPU::SI_SPILL_S192_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE + : AMDGPU::SI_SPILL_S192_SAVE; case 32: - return AMDGPU::SI_SPILL_S256_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S256_CFI_SAVE + : AMDGPU::SI_SPILL_S256_SAVE; case 64: - return AMDGPU::SI_SPILL_S512_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE + : AMDGPU::SI_SPILL_S512_SAVE; case 128: - return AMDGPU::SI_SPILL_S1024_SAVE; + return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE + : AMDGPU::SI_SPILL_S1024_SAVE; default: llvm_unreachable("unknown register size"); } } -static unsigned getVGPRSpillSaveOpcode(unsigned Size) { +static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) { switch (Size) { case 4: - return AMDGPU::SI_SPILL_V32_SAVE; + return NeedsCFI ? 
AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
   case 8:
-    return AMDGPU::SI_SPILL_V64_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
   case 12:
-    return AMDGPU::SI_SPILL_V96_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
   case 16:
-    return AMDGPU::SI_SPILL_V128_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V128_SAVE;
   case 20:
-    return AMDGPU::SI_SPILL_V160_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V160_SAVE;
   case 24:
-    return AMDGPU::SI_SPILL_V192_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V192_SAVE;
   case 32:
-    return AMDGPU::SI_SPILL_V256_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
-    return AMDGPU::SI_SPILL_V512_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V512_SAVE;
   case 128:
-    return AMDGPU::SI_SPILL_V1024_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1362,12 +1374,10 @@
   }
 }
 
-void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MI,
-                                      Register SrcReg, bool isKill,
-                                      int FrameIndex,
-                                      const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI) const {
+void SIInstrInfo::storeRegToStackSlotImpl(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI, bool NeedsCFI) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -1388,7 +1398,8 @@
 
   // We are only allowed to create one new instruction when spilling
   // registers, so we need to use pseudo instruction for spilling SGPRs.
-  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
+  const MCInstrDesc &OpDesc =
+      get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
 
   // The SGPR spill/restore instructions only work on number sgprs, so we need
   // to make sure we are using the correct register class.
@@ -1408,8 +1419,9 @@
     return;
   }
 
-  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
-                                    : getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.hasAGPRs(RC)
+                        ? getAGPRSpillSaveOpcode(SpillSize)
+                        : getVGPRSpillSaveOpcode(SpillSize, NeedsCFI);
   MFI->setHasSpilledVGPRs();
 
   BuildMI(MBB, MI, DL, get(Opcode))
@@ -1420,6 +1432,24 @@
     .addMemOperand(MMO);
 }
 
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      Register SrcReg, bool isKill,
+                                      int FrameIndex,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, false);
+}
+
+void SIInstrInfo::storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         Register SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, true);
+}
+
 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
   switch (Size) {
   case 4:
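(Aside, for review: the byte string that `buildCFIForVGPRToVMEMSpill` emits in SIFrameLowering.cpp above can be reproduced off-line. The sketch below is illustrative and not the in-tree code: `encodeULEB128` is reimplemented locally, and the opcode byte values, including the LLVM vendor extensions `DW_OP_LLVM_offset_uconst` (0xe4), `DW_OP_LLVM_call_frame_entry_reg` (0xe6), and `DW_OP_LLVM_select_bit_piece` (0xec), are taken from the debug-frame.ll comments added further down.)

```cpp
// Standalone sketch reproducing the DW_CFA_expression byte string built by
// buildCFIForVGPRToVMEMSpill. Opcode values follow the debug-frame.ll
// comments in this patch; this is illustrative, not the in-tree code.
#include <cstdint>
#include <cstdio>
#include <vector>

static void encodeULEB128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t B = V & 0x7f;
    V >>= 7;
    if (V)
      B |= 0x80; // more bytes follow
    Out.push_back(B);
  } while (V);
}

int main() {
  const unsigned DwarfVGPR = 2600; // v40 in a wave64 function (2560 + 40)
  const uint64_t Offset = 256;     // frame offset: 4 bytes * 64 lanes
  const unsigned DwarfExec = 17;   // EXEC for wave64
  const unsigned WaveSize = 64;    // wavefront size

  std::vector<uint8_t> Block;
  Block.push_back(0x90); // DW_OP_regx
  encodeULEB128(DwarfVGPR, Block);
  Block.push_back(0x16); // DW_OP_swap
  Block.push_back(0xe4); // DW_OP_LLVM_offset_uconst
  encodeULEB128(Offset, Block);
  Block.push_back(0xe6); // DW_OP_LLVM_call_frame_entry_reg
  encodeULEB128(DwarfExec, Block);
  Block.push_back(0x94); // DW_OP_deref_size
  Block.push_back(WaveSize / 8);
  Block.push_back(0xec); // DW_OP_LLVM_select_bit_piece
  encodeULEB128(32, Block); // VGPR lane width in bits
  encodeULEB128(WaveSize, Block);

  std::vector<uint8_t> CFI;
  CFI.push_back(0x10); // DW_CFA_expression
  encodeULEB128(DwarfVGPR, CFI);
  encodeULEB128(Block.size(), CFI);
  CFI.insert(CFI.end(), Block.begin(), Block.end());

  for (uint8_t B : CFI)
    printf("0x%02x, ", B);
  printf("\n");
}
```

Its output is exactly the operand list of the WAVE64 `.cfi_escape` expected for v40 in the new debug-frame.ll test below.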
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -637,6 +637,13 @@
     let mayLoad = 0;
   }
 
+  def _CFI_SAVE : PseudoInstSI <
+    (outs),
+    (ins sgpr_class:$data, i32imm:$addr)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+  }
+
   def _RESTORE : PseudoInstSI <
     (outs sgpr_class:$data),
     (ins i32imm:$addr)> {
@@ -678,6 +685,18 @@
     let Size = !if(!le(MaxSize, 256), MaxSize, 252);
   }
 
+  def _CFI_SAVE : VPseudoInstSI <
+    (outs),
+    (ins vgpr_class:$vdata, i32imm:$vaddr,
+         SReg_32:$soffset, i32imm:$offset)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+    // (2 * 4) + (8 * num_subregs) bytes maximum
+    int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+    // Size field is unsigned char and cannot fit more.
+    let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+  }
+
   def _RESTORE : VPseudoInstSI <
     (outs vgpr_class:$vdata),
     (ins i32imm:$vaddr,
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -82,7 +82,7 @@
                            ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) {
   MachineFunction &MF = *SaveBlock.getParent();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -112,9 +112,8 @@
                                bool IsLoad) const;
 
   /// If \p OnlyToVGPR is true, this will only succeed if this
-  bool spillSGPR(MachineBasicBlock::iterator MI,
-                 int FI, RegScavenger *RS,
-                 bool OnlyToVGPR = false) const;
+  bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
+                 bool OnlyToVGPR = false, bool NeedsCFI = false) const;
 
   bool restoreSGPR(MachineBasicBlock::iterator MI,
                    int FI, RegScavenger *RS,
@@ -319,15 +318,11 @@
   ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
 
 private:
-  void buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                           unsigned LoadStoreOp,
-                           int Index,
-                           Register ValueReg,
-                           bool ValueIsKill,
-                           MCRegister ScratchOffsetReg,
-                           int64_t InstrOffset,
-                           MachineMemOperand *MMO,
-                           RegScavenger *RS) const;
+  void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+                           int Index, Register ValueReg, bool ValueIsKill,
+                           MCRegister ScratchOffsetReg, int64_t InstrOffset,
+                           
MachineMemOperand *MMO, RegScavenger *RS, + bool NeedsCFI = false) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -546,22 +546,28 @@ switch (Op) { case AMDGPU::SI_SPILL_S1024_SAVE: + case AMDGPU::SI_SPILL_S1024_CFI_SAVE: case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_V1024_SAVE: + case AMDGPU::SI_SPILL_V1024_CFI_SAVE: case AMDGPU::SI_SPILL_V1024_RESTORE: case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A1024_RESTORE: return 32; case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_CFI_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_CFI_SAVE: case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_CFI_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_CFI_SAVE: case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: @@ -574,36 +580,46 @@ case AMDGPU::SI_SPILL_A192_RESTORE: return 6; case AMDGPU::SI_SPILL_S160_SAVE: + case AMDGPU::SI_SPILL_S160_CFI_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: case AMDGPU::SI_SPILL_V160_SAVE: + case AMDGPU::SI_SPILL_V160_CFI_SAVE: case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A160_RESTORE: return 5; case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_CFI_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_CFI_SAVE: case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: + case AMDGPU::SI_SPILL_S96_CFI_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_CFI_SAVE: case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A96_RESTORE: return 3; case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_CFI_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_CFI_SAVE: case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_A64_SAVE: case AMDGPU::SI_SPILL_A64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_CFI_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_CFI_SAVE: case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_A32_SAVE: case AMDGPU::SI_SPILL_A32_RESTORE: @@ -774,19 +790,16 @@ return LoadStoreOp; } -void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - int Index, - Register ValueReg, - bool IsKill, - MCRegister ScratchOffsetReg, - int64_t InstOffset, - MachineMemOperand *MMO, - RegScavenger *RS) const { +void SIRegisterInfo::buildSpillLoadStore( + MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index, + Register ValueReg, bool IsKill, MCRegister ScratchOffsetReg, + int64_t InstOffset, MachineMemOperand *MMO, RegScavenger *RS, + bool NeedsCFI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); + const SIFrameLowering *TFL = ST.getFrameLowering(); const 
SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
 
   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
@@ -811,6 +824,7 @@
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
   int64_t ScratchOffsetRegDelta = 0;
+  int64_t AdditionalCFIOffset = 0;
 
   if (IsFlat && EltSize > 4) {
     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
@@ -856,6 +870,7 @@
       Scavenged = true;
     }
 
+    AdditionalCFIOffset = Offset;
     if (!SOffset)
       report_fatal_error("could not scavenge SGPR to spill in entry function");
 
@@ -998,6 +1013,12 @@
         .addImm(0); // swz
     MIB.addMemOperand(NewMMO);
 
+    if (IsStore && NeedsCFI)
+      TFL->buildCFIForVGPRToVMEMSpill(*MBB, MI, DL, SubReg,
+                                      (Offset + RemRegOffset) *
+                                              ST.getWavefrontSize() +
+                                          AdditionalCFIOffset);
+
     if (!IsAGPR && NeedSuperRegDef)
       MIB.addReg(ValueReg, RegState::ImplicitDefine);
 
@@ -1137,13 +1158,13 @@
   }
 }
 
-bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
-                               int Index,
-                               RegScavenger *RS,
-                               bool OnlyToVGPR) const {
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
+                               RegScavenger *RS, bool OnlyToVGPR,
+                               bool NeedsCFI) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const SIFrameLowering *TFL = ST.getFrameLowering();
 
   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
       MFI->getSGPRToVGPRSpills(Index);
@@ -1187,6 +1208,9 @@
           .addImm(Spill.Lane)
           .addReg(Spill.VGPR);
 
+      if (NeedsCFI)
+        TFL->buildCFIForSGPRToVGPRSpill(*MBB, MI, DL, SubReg, Spill.VGPR,
+                                        Spill.Lane);
       if (i == 0 && NumSubRegs > 1) {
         // We may be spilling a super-register which is only partially defined,
         // and need to ensure later spills think the value is defined.
@@ -1245,6 +1269,8 @@
       // Write out VGPR
       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                               RS, false);
+
+      // TODO: Implement CFI for SpillToVMEM if/when it is fully supported.
     }
   }
 
@@ -1339,7 +1365,18 @@
     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS) const {
+  bool NeedsCFI = false;
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE:
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
@@ -1349,7 +1386,7 @@
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
-    return spillSGPR(MI, FI, RS, true);
+    return spillSGPR(MI, FI, RS, true, NeedsCFI);
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
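(Note on the `Offset` argument passed to `buildCFIForVGPRToVMEMSpill` in the buildSpillLoadStore hunk above: scratch memory is swizzled per lane, so a dword stored at per-lane offset N lives at N times the wavefront size within the wave's frame. A minimal sketch of that arithmetic, under the assumption that no scratch-offset SGPR had to be scavenged, i.e. `AdditionalCFIOffset` is zero:)

```cpp
// Sketch of the CFI offset computation used in buildSpillLoadStore above,
// assuming no scratch-offset scavenging (AdditionalCFIOffset == 0).
#include <cassert>
#include <cstdint>

static int64_t cfiSpillOffset(int64_t Offset, int64_t RemRegOffset,
                              unsigned WavefrontSize,
                              int64_t AdditionalCFIOffset = 0) {
  // Consecutive lanes of one dword are contiguous in scratch, so the
  // wave-level frame offset is the per-lane offset scaled by the wave size.
  return (Offset + RemRegOffset) * WavefrontSize + AdditionalCFIOffset;
}

int main() {
  // v40 is stored at s32 offset:4 in the debug-frame.ll test below; the
  // wave64 CFI expression therefore uses 4 * 64 = 256, which is the
  // ULEB128(256)=[0x80, 0x02] in the expected .cfi_escape string.
  assert(cfiSpillOffset(4, 0, 64) == 256);
  // The wave32 run of the same test expects 4 * 32 = 128.
  assert(cfiSpillOffset(4, 0, 32) == 128);
  return 0;
}
```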
@@ -1384,19 +1421,32 @@
                  ? getBaseRegister() : getFrameRegister(*MF);
 
+  bool NeedsCFI = false;
+
   switch (MI->getOpcode()) {
     // SGPR register spill
-    case AMDGPU::SI_SPILL_S1024_SAVE:
-    case AMDGPU::SI_SPILL_S512_SAVE:
-    case AMDGPU::SI_SPILL_S256_SAVE:
-    case AMDGPU::SI_SPILL_S192_SAVE:
-    case AMDGPU::SI_SPILL_S160_SAVE:
-    case AMDGPU::SI_SPILL_S128_SAVE:
-    case AMDGPU::SI_SPILL_S96_SAVE:
-    case AMDGPU::SI_SPILL_S64_SAVE:
-    case AMDGPU::SI_SPILL_S32_SAVE: {
-      spillSGPR(MI, Index, RS);
-      break;
+    case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+    case AMDGPU::SI_SPILL_S32_CFI_SAVE:
+      NeedsCFI = true;
+      LLVM_FALLTHROUGH;
+    case AMDGPU::SI_SPILL_S1024_SAVE:
+    case AMDGPU::SI_SPILL_S512_SAVE:
+    case AMDGPU::SI_SPILL_S256_SAVE:
+    case AMDGPU::SI_SPILL_S192_SAVE:
+    case AMDGPU::SI_SPILL_S160_SAVE:
+    case AMDGPU::SI_SPILL_S128_SAVE:
+    case AMDGPU::SI_SPILL_S96_SAVE:
+    case AMDGPU::SI_SPILL_S64_SAVE:
+    case AMDGPU::SI_SPILL_S32_SAVE: {
+      spillSGPR(MI, Index, RS, false, NeedsCFI);
+      break;
     }
 
     // SGPR register restore
@@ -1414,6 +1464,16 @@
     }
 
     // VGPR register spill
+    case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V512_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V256_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V160_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V128_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V96_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V64_CFI_SAVE:
+    case AMDGPU::SI_SPILL_V32_CFI_SAVE:
+      NeedsCFI = true;
+      LLVM_FALLTHROUGH;
    case AMDGPU::SI_SPILL_V1024_SAVE:
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
@@ -1439,13 +1499,10 @@
       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
-      buildSpillLoadStore(MI, Opc,
-                          Index,
-                          VData->getReg(), VData->isKill(),
-                          FrameReg,
-                          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
-                          *MI->memoperands_begin(),
-                          RS);
+      buildSpillLoadStore(
+          MI, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin(), RS, NeedsCFI);
       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
       MI->eraseFromParent();
       break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -11,7 +11,7 @@
 ; GCN-NEXT: s_mov_b32 s6, s33
 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0
 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: v_add_co_u32_e32 v12, vcc, 64, v0
+; GCN-NEXT: s_add_u32 s32, s32, 0x10000
 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
@@ -25,6 +25,7 @@
 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_co_u32_e32 v12, vcc, 64, v0
 ; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
 ; GCN-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off 
offset:32 @@ -34,7 +35,6 @@ ; GCN-NEXT: v_mov_b32_e32 v17, s5 ; GCN-NEXT: v_mov_b32_e32 v16, s4 ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill @@ -333,7 +333,7 @@ ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v12, vcc, 64, v0 +; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill @@ -347,6 +347,7 @@ ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, 64, v0 ; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc ; GCN-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 @@ -356,7 +357,6 @@ ; GCN-NEXT: v_mov_b32_e32 v17, s5 ; GCN-NEXT: v_mov_b32_e32 v16, s4 ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill @@ -660,7 +660,7 @@ ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_add_u32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 +; GCN-NEXT: s_add_u32 s32, s32, 0x14000 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -676,6 +676,7 @@ ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc ; GCN-NEXT: global_load_dwordx4 v[7:10], v[3:4], off offset:16 @@ -696,7 +697,6 @@ ; GCN-NEXT: v_lshrrev_b32_e64 v62, 6, s33 ; GCN-NEXT: v_add_u32_e32 v62, 0x100, v62 ; GCN-NEXT: v_add_u32_e32 v2, 16, v62 -; GCN-NEXT: s_add_u32 s32, s32, 0x14000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x14000 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -239,9 +239,9 @@ ; GCN: s_waitcnt ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 
offset:8
 ; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8
@@ -251,11 +251,9 @@
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF: s_add_u32 s32, s32, 0x300
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
+; MUBUF: s_sub_u32 s32, s32, 0x300
 ; MUBUF-NEXT: s_mov_b32 s33, s4
-; FLATSCR: s_add_u32 s32, s32, 12
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
+; FLATSCR: s_sub_u32 s32, s32, 12
 ; FLATSCR-NEXT: s_mov_b32 s33, s0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -271,18 +269,15 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_writelane_b32 v1, s33, 63
 ; GCN: s_mov_b32 s33, s32
-; GCN-COUNT-60: v_writelane_b32 v1
-; GCN-COUNT-1: v_writelane_b32 v1
 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GCN-COUNT-60: v_writelane_b32 v1
+; GCN-COUNT-1: v_writelane_b32 v1
 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v1
-; MUBUF: s_add_u32 s32, s32, 0x300
 ; MUBUF: s_sub_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
 ; FLATSCR: s_sub_u32 s32, s32, 12
 ; GCN-NEXT: v_readlane_b32 s33, v1, 63
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -308,21 +303,18 @@
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-COUNT-62: v_writelane_b32 v1,
-; GCN: v_writelane_b32 v1,
 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GCN-COUNT-62: v_writelane_b32 v1,
+; GCN: v_writelane_b32 v1,
 ; MUBUF: buffer_store_dword
 ; FLATSCR: scratch_store_dword
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v1,
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF: s_add_u32 s32, s32, 0x300
-; FLATSCR: s_add_u32 s32, s32, 12
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
-; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300
-; FLATSCR-NEXT: s_sub_u32 s32, s32, 12
+; MUBUF: s_sub_u32 s32, s32, 0x300
+; FLATSCR: s_sub_u32 s32, s32, 12
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -522,6 +522,102 @@
   ret void
 }
 
+; CHECK-LABEL: func_spill_vgpr_to_vmem:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+
+; DW_CFA_expression [0x10]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; BLOCK_LENGTH ULEB128(14)=[0x0e]
+; DW_OP_regx [0x90]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(256)=[0x80, 0x02]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave64 ULEB128(17)=[0x11]
+; DW_OP_deref_size [0x94]
+; SIZE [0x08]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x40]
+; WAVE64-NEXT: .cfi_escape 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40
+
+; 
DW_CFA_expression [0x10] +; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] +; BLOCK_LENGTH ULEB128(14)=[0x0e] +; DW_OP_regx [0x90] +; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] +; DW_OP_swap [0x16] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(128)=[0x80, 0x01] +; DW_OP_LLVM_call_frame_entry_reg [0xe6] +; EXEC_MASK_wave32 ULEB128(1)=[0x01] +; DW_OP_deref_size [0x94] +; SIZE [0x04] +; DW_OP_LLVM_select_bit_piece [0xec] +; ELEMENT_SIZE [0x20] +; ELEMENT_COUNT [0x20] +; WAVE32-NEXT: .cfi_escape 0x10, 0xa8, 0x0c, 0x0e, 0x90, 0xa8, 0x0c, 0x16, 0xe4, 0x80, 0x01, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill + +; DW_CFA_expression [0x10] +; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] +; BLOCK_LENGTH ULEB128(13)=[0x0d] +; DW_OP_regx [0x90] +; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] +; DW_OP_swap [0x16] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0)=[0x00] +; DW_OP_LLVM_call_frame_entry_reg [0xe6] +; EXEC_MASK_wave64 ULEB128(17)=[0x11] +; DW_OP_deref_size [0x94] +; SIZE [0x08] +; DW_OP_LLVM_select_bit_piece [0xec] +; ELEMENT_SIZE [0x20] +; ELEMENT_COUNT [0x40] +; WAVE64-NEXT: .cfi_escape 0x10, 0xa9, 0x14, 0x0d, 0x90, 0xa9, 0x14, 0x16, 0xe4, 0x00, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40 + +; DW_CFA_expression [0x10] +; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] +; BLOCK_LENGTH ULEB128(13)=[0x0d] +; DW_OP_regx [0x90] +; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] +; DW_OP_swap [0x16] +; DW_OP_LLVM_offset_uconst [0xe4] +; OFFSET ULEB128(0)=[0x00] +; DW_OP_LLVM_call_frame_entry_reg [0xe6] +; EXEC_MASK_wave32 ULEB128(1)=[0x01] +; DW_OP_deref_size [0x94] +; SIZE [0x04] +; DW_OP_LLVM_select_bit_piece [0xec] +; ELEMENT_SIZE [0x20] +; ELEMENT_COUNT [0x20] +; WAVE32-NEXT: .cfi_escape 0x10, 0xa9, 0x0c, 0x0d, 0x90, 0xa9, 0x0c, 0x16, 0xe4, 0x00, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 + +; CHECK-NOT: .cfi_{{.*}} + +; CHECK: .cfi_endproc +define hidden void @func_spill_vgpr_to_vmem() #0 { +entry: + call void asm sideeffect "; clobber", "~{v40}"() #0 + call void asm sideeffect "; clobber", "~{v41}"() #0 + ret void +} + ; NOTE: Number of VGPRs available to kernel, and in turn number of corresponding CFIs generated, ; is dependent on waves/WG size. 
Since the intent here is to check whether we generate the correct ; CFIs, doing it for any one set of details is sufficient which also makes the test insensitive to diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2874,10 +2874,10 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v42, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v40, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -2910,17 +2910,17 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v42, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5568,6 +5568,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 @@ -5579,22 +5580,21 @@ ; GFX9-NEXT: v_writelane_b32 v40, s44, 8 ; GFX9-NEXT: v_writelane_b32 v40, s45, 9 ; GFX9-NEXT: v_writelane_b32 v40, s46, 10 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s47, 11 ; GFX9-NEXT: v_writelane_b32 v40, s48, 12 ; GFX9-NEXT: v_writelane_b32 v40, s49, 13 ; GFX9-NEXT: v_writelane_b32 v40, s50, 14 ; GFX9-NEXT: v_writelane_b32 v40, s51, 15 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 ; GFX9-NEXT: v_mov_b32_e32 v2, s48 @@ -5654,9 +5654,8 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-NEXT: s_mov_b32 
s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s36, 0 ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 @@ -5672,16 +5671,17 @@ ; GFX10-NEXT: v_writelane_b32 v40, s49, 13 ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 @@ -5746,6 +5746,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 @@ -5758,19 +5759,22 @@ ; GFX9-NEXT: v_writelane_b32 v40, s45, 9 ; GFX9-NEXT: v_writelane_b32 v40, s46, 10 ; GFX9-NEXT: v_writelane_b32 v40, s47, 11 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s22, s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s48, 12 ; GFX9-NEXT: v_writelane_b32 v40, s49, 13 ; GFX9-NEXT: v_writelane_b32 v40, s50, 14 ; GFX9-NEXT: v_writelane_b32 v40, s51, 15 +; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s22, s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 ; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 @@ -5793,10 +5797,6 @@ ; GFX9-NEXT: s_mov_b32 s27, s43 ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 -; GFX9-NEXT: v_writelane_b32 v40, s31, 17 -; GFX9-NEXT: s_getpc_b64 s[30:31] -; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 @@ -5835,17 +5835,12 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[20:21], 
s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s36, 0 ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 ; GFX10-NEXT: v_writelane_b32 v40, s40, 4 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s22 ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 ; GFX10-NEXT: v_writelane_b32 v40, s43, 7 @@ -5858,15 +5853,20 @@ ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, s22 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -13,10 +13,10 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -47,13 +47,13 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -109,9 +109,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 @@ -146,12 +146,12 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; 
GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s31 @@ -188,9 +188,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND @@ -225,17 +225,17 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v40, v31 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART @@ -268,9 +268,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s33, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 @@ -303,15 +303,15 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s33, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s33, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART @@ -343,9 +343,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 @@ -378,15 +378,15 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; 
GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 1 @@ -418,9 +418,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 @@ -453,9 +453,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 @@ -691,9 +691,9 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s40, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 @@ -726,15 +726,15 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 1 @@ -766,10 +766,10 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v41, s40, 0 -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s40, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -810,16 +810,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v41, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v41, s40, 0 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s40, 0 +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; 
GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -194,22 +194,22 @@ ; GFX9-NEXT: v_writelane_b32 v43, s33, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v43, s35, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s35, 1 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, v1 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_writelane_b32 v43, s30, 2 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 ; GFX9-NEXT: v_writelane_b32 v43, s31, 3 ; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, v40 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-csr-vgpr-spill.ll @@ -30,10 +30,10 @@ ; CHECK-NEXT: s_mov_b32 s8, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_u32 s32, s32, 0x400 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: ;;#ASMSTART @@ -77,11 +77,11 @@ ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v1, s33, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: v_writelane_b32 v1, s33, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir --- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir +++ 
b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir @@ -2,9 +2,13 @@ # CHECK-LABEL: name: empty_entry_block # CHECK: V_WRITELANE +# CHECK-NEXT: CFI_INSTRUCTION # CHECK-NEXT: V_WRITELANE +# CHECK-NEXT: CFI_INSTRUCTION # CHECK-NEXT: V_WRITELANE +# CHECK-NEXT: CFI_INSTRUCTION # CHECK-NEXT: V_WRITELANE +# CHECK-NEXT: CFI_INSTRUCTION # CHECK: V_READLANE # CHECK-NEXT: V_READLANE # CHECK-NEXT: V_READLANE
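As a cross-check on the DWARF register numbers quoted in the debug-frame.ll comments above, the ULEB128 byte pairs decode as below. This is a small illustrative sketch; the wave64 and wave32 VGPR bases of 2560 and 1536 are inferred from those same comments, not from a header in this patch.

```cpp
// Decode the ULEB128 byte pairs quoted in the debug-frame.ll comments.
#include <cstdint>
#include <cstdio>

static uint64_t decodeULEB128(const uint8_t *P, unsigned N) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  for (unsigned I = 0; I != N; ++I) {
    Value |= uint64_t(P[I] & 0x7f) << Shift; // low 7 bits per byte
    Shift += 7;
  }
  return Value;
}

int main() {
  const uint8_t V40Wave64[] = {0xa8, 0x14}; // 2600 = 2560 + 40
  const uint8_t V40Wave32[] = {0xa8, 0x0c}; // 1576 = 1536 + 40
  const uint8_t V41Wave64[] = {0xa9, 0x14}; // 2601 = 2560 + 41
  printf("%llu %llu %llu\n",
         (unsigned long long)decodeULEB128(V40Wave64, 2),
         (unsigned long long)decodeULEB128(V40Wave32, 2),
         (unsigned long long)decodeULEB128(V41Wave64, 2)); // 2600 1576 2601
  return 0;
}
```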