diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -82,6 +82,9 @@
   applyMappingImage(MachineInstr &MI, const OperandsMapper &OpdMapper,
                     MachineRegisterInfo &MRI, int RSrcIdx) const;
+  unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset,
+                            Register &VOffsetReg, Register &SOffsetReg,
+                            int64_t &InstOffsetVal, Align Alignment) const;
   bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
 
   bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1233,31 +1233,18 @@
   return true;
 }
 
-static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
-                                        Register Reg) {
-  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
-  if (!Def)
-    return Reg;
-
-  // TODO: Guard against this being an implicit def
-  return Def->getOperand(0).getReg();
-}
-
 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
 // the three offsets (voffset, soffset and instoffset)
-static unsigned setBufferOffsets(MachineIRBuilder &B,
-                                 const AMDGPURegisterBankInfo &RBI,
-                                 Register CombinedOffset, Register &VOffsetReg,
-                                 Register &SOffsetReg, int64_t &InstOffsetVal,
-                                 Align Alignment) {
+unsigned AMDGPURegisterBankInfo::setBufferOffsets(
+    MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
+    Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
   const LLT S32 = LLT::scalar(32);
   MachineRegisterInfo *MRI = B.getMRI();
 
   if (std::optional<int64_t> Imm =
           getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
-                                 Alignment)) {
+    if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
       InstOffsetVal = ImmOffset;
@@ -1275,9 +1262,9 @@
       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
 
   uint32_t SOffset, ImmOffset;
-  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                                  &RBI.Subtarget, Alignment)) {
-    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+  if ((int)Offset > 0 &&
+      TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
+    if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
       VOffsetReg = Base;
       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
@@ -1298,11 +1285,11 @@
   // Handle the variable sgpr + vgpr case.
   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
   if (Add && (int)Offset >= 0) {
-    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
-    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
+    Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
+    Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
 
-    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
-    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
+    const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
+    const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
 
     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
       VOffsetReg = Src0;
@@ -1319,7 +1306,7 @@
   // Ensure we have a VGPR for the combined offset. This could be an issue if we
   // have an SGPR offset and a VGPR resource.
-  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+  if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
     VOffsetReg = CombinedOffset;
   } else {
     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
@@ -1369,8 +1356,8 @@
   Register VOffset;
   int64_t ImmOffset = 0;
 
-  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
-                                        VOffset, SOffset, ImmOffset, Alignment);
+  unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
+                                        SOffset, ImmOffset, Alignment);
 
   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
   // can, but we need to track an MMO for that.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8545,12 +8545,12 @@
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                         SelectionDAG &DAG, SDValue *Offsets,
                                         Align Alignment) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
-                                 Alignment)) {
+    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -8562,8 +8562,8 @@
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                                Subtarget, Alignment)) {
+    if (Offset >= 0 &&
+        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1135,6 +1135,9 @@
     return isUInt<12>(Imm);
   }
 
+  bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+                        Align Alignment = Align(4)) const;
+
   /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
   /// encoded instruction. If \p Signed, this is for an instruction that
   /// interprets the offset as signed.
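Note on the hunks above: both selection paths decompose a combined buffer offset into three components, a variable voffset that must live in a VGPR, a uniform soffset placed in an SGPR, and a 12-bit immediate encoded directly in the MUBUF instruction, and the sum of the three must always equal the original combined offset. A minimal standalone sketch of that invariant follows; it is plain C++ with hypothetical stand-in names, not the LLVM API:

// Minimal sketch of the voffset/soffset/instoffset invariant maintained by
// setBufferOffsets (hypothetical stand-in types, not the LLVM API).
#include <cassert>
#include <cstdint>

struct BufferOffsets {
  uint32_t VOffset;    // variable component, selected into a VGPR
  uint32_t SOffset;    // uniform component, selected into an SGPR
  uint32_t InstOffset; // immediate encoded in the MUBUF instruction word
};

int main() {
  // Example: a combined offset of base + 5000 at 4-byte alignment. The
  // 4092/908 split is produced by the splitMUBUFOffset arithmetic below.
  uint32_t Base = 0x100;
  BufferOffsets O = {/*VOffset=*/Base, /*SOffset=*/4092, /*InstOffset=*/908};
  assert(O.VOffset + O.SOffset + O.InstOffset == Base + 5000);
  assert(O.InstOffset < 4096 && "must fit the 12-bit offset field");
  return 0;
}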
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7897,6 +7897,51 @@
   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
 }
 
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
+bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
+                                   uint32_t &ImmOffset, Align Alignment) const {
+  const uint32_t MaxImm = alignDown(4095, Alignment.value());
+  uint32_t Overflow = 0;
+
+  if (Imm > MaxImm) {
+    if (Imm <= MaxImm + 64) {
+      // Use an SOffset inline constant for 4..64
+      Overflow = Imm - MaxImm;
+      Imm = MaxImm;
+    } else {
+      // Try to keep the same value in SOffset for adjacent loads, so that
+      // the corresponding register contents can be re-used.
+      //
+      // Load values with all low-bits (except for alignment bits) set into
+      // SOffset, so that a larger range of values can be covered using
+      // s_movk_i32.
+      //
+      // Atomic operations fail to work correctly when individual address
+      // components are unaligned, even if their sum is aligned.
+      uint32_t High = (Imm + Alignment.value()) & ~4095;
+      uint32_t Low = (Imm + Alignment.value()) & 4095;
+      Imm = Low;
+      Overflow = High - Alignment.value();
+    }
+  }
+
+  // There is a hardware bug in SI and CI which prevents address clamping in
+  // MUBUF instructions from working correctly with SOffsets. The immediate
+  // offset is unaffected.
+  if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  ImmOffset = Imm;
+  SOffset = Overflow;
+  return true;
+}
+
 // Depending on the used address space and instructions, some immediate offsets
 // are allowed and some are not.
 // In general, flat instruction offsets can only be non-negative, global and
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -26,7 +26,6 @@
 struct Align;
 class Argument;
 class Function;
-class GCNSubtarget;
 class GlobalValue;
 class MCInstrInfo;
 class MCRegisterClass;
@@ -1301,10 +1300,6 @@
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
-bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget,
-                      Align Alignment = Align(4));
-
 LLVM_READNONE
 inline bool isLegal64BitDPPControl(unsigned DC) {
   return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -10,15 +10,17 @@
 #include "AMDGPU.h"
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
-#include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/CommandLine.h"
@@ -2587,52 +2589,6 @@
   return 13;
 }
 
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
-//
-// The required alignment ensures that individual address components remain
-// aligned if they are aligned to begin with. It also ensures that additional
-// offsets within the given alignment can be added to the resulting ImmOffset.
-bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, Align Alignment) {
-  const uint32_t MaxImm = alignDown(4095, Alignment.value());
-  uint32_t Overflow = 0;
-
-  if (Imm > MaxImm) {
-    if (Imm <= MaxImm + 64) {
-      // Use an SOffset inline constant for 4..64
-      Overflow = Imm - MaxImm;
-      Imm = MaxImm;
-    } else {
-      // Try to keep the same value in SOffset for adjacent loads, so that
-      // the corresponding register contents can be re-used.
-      //
-      // Load values with all low-bits (except for alignment bits) set into
-      // SOffset, so that a larger range of values can be covered using
-      // s_movk_i32.
-      //
-      // Atomic operations fail to work correctly when individual address
-      // components are unaligned, even if their sum is aligned.
-      uint32_t High = (Imm + Alignment.value()) & ~4095;
-      uint32_t Low = (Imm + Alignment.value()) & 4095;
-      Imm = Low;
-      Overflow = High - Alignment.value();
-    }
-  }
-
-  // There is a hardware bug in SI and CI which prevents address clamping in
-  // MUBUF instructions from working correctly with SOffsets. The immediate
-  // offset is unaffected.
-  if (Overflow > 0 &&
-      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
-    return false;
-
-  ImmOffset = Imm;
-  SOffset = Overflow;
-  return true;
-}
-
 SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
   *this = getDefaultForCallingConv(F.getCallingConv());
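For reviewers, here is a standalone model of the splitMUBUFOffset arithmetic this change moves into SIInstrInfo, with worked values. It is plain C++ rather than the LLVM API: alignDown is inlined, and the SI/CI subtarget check is modeled by a BuggySOffsetClamping flag introduced only for illustration:

// Plain C++ model of the moved function; the real code takes llvm::Align
// and queries the subtarget generation instead of a boolean flag.
#include <cassert>
#include <cstdint>

bool splitMUBUFOffsetModel(uint32_t Imm, uint32_t &SOffset,
                           uint32_t &ImmOffset, uint32_t Align,
                           bool BuggySOffsetClamping) {
  const uint32_t MaxImm = 4095 / Align * Align; // alignDown(4095, Align)
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // An overflow of 4..64 fits an SOffset inline constant.
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Put all low bits except the alignment bits into SOffset so that
      // adjacent loads can share one s_movk_i32 result.
      uint32_t High = (Imm + Align) & ~4095u;
      uint32_t Low = (Imm + Align) & 4095u;
      Imm = Low;
      Overflow = High - Align;
    }
  }

  if (Overflow > 0 && BuggySOffsetClamping)
    return false; // SI/CI: non-zero SOffset breaks MUBUF address clamping

  ImmOffset = Imm;
  SOffset = Overflow;
  return true;
}

int main() {
  uint32_t S, I;
  // Small overflow: 4100 = 4092 (immediate) + 8 (SOffset inline constant).
  assert(splitMUBUFOffsetModel(4100, S, I, 4, false) && S == 8 && I == 4092);
  // Large offset: 5000 = 4092 (SOffset, low bits set) + 908 (immediate).
  assert(splitMUBUFOffsetModel(5000, S, I, 4, false) && S == 4092 && I == 908);
  // Subtargets with the clamping bug must refuse any non-zero SOffset.
  assert(!splitMUBUFOffsetModel(5000, S, I, 4, true));
  return 0;
}

With 4-byte alignment, every Imm in roughly (4156, 8187] maps to the same SOffset of 4092, which is what lets adjacent loads reuse a single s_movk_i32 result, as the comment in the moved function describes.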