diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1375,13 +1375,15 @@ AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS); // Don't fold null pointer. if (Imm != NullPtr) { - SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + SDValue HighBits = + CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32); MachineSDNode *MovHighBits = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); VAddr = SDValue(MovHighBits, 0); SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i16); return true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4195,9 +4195,10 @@ // TODO: Should this be inside the render function? The iterator seems to // move. 
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), HighBits) - .addImm(Offset & ~4095); + .addImm(Offset & ~MaxOffset); return {{[=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); @@ -4211,7 +4212,7 @@ MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(Offset & 4095); + MIB.addImm(Offset & MaxOffset); }}}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4255,7 +4255,7 @@ std::pair AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -4268,13 +4268,14 @@ if (MRI.getType(BaseReg).isPointer()) BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a multiple of a large power of + // 2, and it stands more chance of being CSEd with the copy/add for another + // similar load/store. 
+ // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1791,7 +1791,7 @@ std::pair AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -1802,13 +1802,14 @@ unsigned C1 = 0; if (ImmOffset != 0) { - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a multiple of a large power of + // 2, and it stands more chance of being CSEd with the copy/add for another + // similar load/store. + // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. 
unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8495,7 +8495,7 @@ std::pair SITargetLowering::splitBufferOffsets( SDValue Offset, SelectionDAG &DAG) const { SDLoc DL(Offset); - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); SDValue N0 = Offset; ConstantSDNode *C1 = nullptr; @@ -8508,13 +8508,14 @@ if (C1) { unsigned ImmOffset = C1->getZExtValue(); - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a multiple of a large power of + // 2, and it stands more chance of being CSEd with the copy/add for another + // similar load/store. + // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. 
unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1135,6 +1135,8 @@ return isUInt<12>(Imm); } + static unsigned getMaxMUBUFImmOffset(); + bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment = Align(4)) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7874,6 +7874,8 @@ } } +unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; } + void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { if (!ST.isWave32()) return; @@ -7906,7 +7908,8 @@ // offsets within the given alignment can be added to the resulting ImmOffset. bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment) const { - const uint32_t MaxImm = alignDown(4095, Alignment.value()); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value()); uint32_t Overflow = 0; if (Imm > MaxImm) { @@ -7924,8 +7927,8 @@ // // Atomic operations fail to work correctly when individual address // components are unaligned, even if their sum is aligned. - uint32_t High = (Imm + Alignment.value()) & ~4095; - uint32_t Low = (Imm + Alignment.value()) & 4095; + uint32_t High = (Imm + Alignment.value()) & ~MaxOffset; + uint32_t Low = (Imm + Alignment.value()) & MaxOffset; Imm = Low; Overflow = High - Alignment.value(); }