diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -149,6 +149,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_smrd_buffer_sgpr_imm : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization // directly before selecting a glue-less load, so hide this diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -35,12 +35,15 @@ int64_t Offset; if (Def->getOpcode() == TargetOpcode::G_ADD) { // TODO: Handle G_OR used for add case - if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset))) - return std::make_pair(Def->getOperand(1).getReg(), Offset); + for (unsigned Op : {1, 2}) { + unsigned OtherOp = 3 - Op; + if (mi_match(Def->getOperand(Op).getReg(), MRI, m_ICst(Offset))) + return std::make_pair(Def->getOperand(OtherOp).getReg(), Offset); - // FIXME: matcher should ignore copies - if (mi_match(Def->getOperand(2).getReg(), MRI, m_Copy(m_ICst(Offset)))) - return std::make_pair(Def->getOperand(1).getReg(), Offset); + // FIXME: matcher should ignore copies + if (mi_match(Def->getOperand(Op).getReg(), MRI, m_Copy(m_ICst(Offset)))) + return std::make_pair(Def->getOperand(OtherOp).getReg(), Offset); + } } // Handle G_PTRTOINT (G_PTR_ADD base, const) case diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -193,11 +193,13 @@ bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue Base, SDValue ByteOffsetNode, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, + SDValue *Offset, bool Imm32Only = false, + bool IsBuffer = false) const; SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, - SDValue *Offset, bool Imm32Only = false) const; + SDValue *Offset, bool Imm32Only = false, + bool IsBuffer = false) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -205,8 +207,10 @@ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const; bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset, SDValue &Offset) const; - bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; - bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const; + bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const; + bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, + SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1886,9 +1886,12 @@ // Match an immediate (if Imm is true) or an SGPR (if Imm is false) // offset. If Imm32Only is true, match only 32-bit immediate offsets // available on CI. -bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue Addr, SDValue ByteOffsetNode, +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset, SDValue *Offset, - bool Imm32Only) const { + bool Imm32Only, bool IsBuffer) const { + assert((!SOffset || !Offset) && + "Cannot match both soffset and offset at the same time!"); + ConstantSDNode *C = dyn_cast(ByteOffsetNode); if (!C) { if (!SOffset) @@ -1908,10 +1911,12 @@ } SDLoc SL(ByteOffsetNode); - // GFX9 and GFX10 have signed byte immediate offsets. - int64_t ByteOffset = C->getSExtValue(); + + // GFX9 and GFX10 have signed byte immediate offsets. The immediate + // offset for S_BUFFER instructions is unsigned. + int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue(); Optional EncodedOffset = - AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); + AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer); if (EncodedOffset && Offset && !Imm32Only) { *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; @@ -1970,11 +1975,10 @@ // immediate offsets available on CI. bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, - bool Imm32Only) const { - SDLoc SL(Addr); - + bool Imm32Only, + bool IsBuffer) const { if (SOffset && Offset) { - assert(!Imm32Only); + assert(!Imm32Only && !IsBuffer); SDValue B; return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) && SelectSMRDBaseOffset(B, SBase, SOffset, nullptr); @@ -1982,32 +1986,25 @@ // A 32-bit (address + offset) should not cause unsigned 32-bit integer // wraparound, because s_load instructions perform the addition in 64 bits. - if ((Addr.getValueType() != MVT::i32 || - Addr->getFlags().hasNoUnsignedWrap())) { - SDValue N0, N1; - // Extract the base and offset if possible. - if (CurDAG->isBaseWithConstantOffset(Addr) || - Addr.getOpcode() == ISD::ADD) { - N0 = Addr.getOperand(0); - N1 = Addr.getOperand(1); - } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { - assert(N0 && N1 && isa(N1)); - } - if (N0 && N1) { - if (SelectSMRDOffset(N0, N1, SOffset, Offset, Imm32Only)) { - SBase = N0; - return true; - } - if (SelectSMRDOffset(N1, N0, SOffset, Offset, Imm32Only)) { - SBase = N1; - return true; - } - } + if (Addr.getValueType() == MVT::i32 && !Addr->getFlags().hasNoUnsignedWrap()) return false; + + SDValue N0, N1; + // Extract the base and offset if possible. + if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) { + N0 = Addr.getOperand(0); + N1 = Addr.getOperand(1); + } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { + assert(N0 && N1 && isa(N1)); } - if (Offset && !SOffset) { - SBase = Addr; - *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + if (!N0 || !N1) + return false; + if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) { + SBase = N0; + return true; + } + if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) { + SBase = N1; return true; } return false; @@ -2016,10 +2013,18 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset, SDValue *Offset, bool Imm32Only) const { - if (!SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) - return false; - SBase = Expand32BitAddress(SBase); - return true; + if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) { + SBase = Expand32BitAddress(SBase); + return true; + } + + if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) { + SBase = Expand32BitAddress(Addr); + *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; + } + + return false; } bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, @@ -2045,33 +2050,26 @@ return SelectSMRD(Addr, SBase, &SOffset, &Offset); } -bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, - SDValue &Offset) const { - if (ConstantSDNode *C = dyn_cast(Addr)) { - // The immediate offset for S_BUFFER instructions is unsigned. - if (auto Imm = - AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) { - Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); - return true; - } - } - - return false; +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const { + return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + /* Imm32Only */ false, /* IsBuffer */ true); } -bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const { assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - - if (ConstantSDNode *C = dyn_cast(Addr)) { - if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, - C->getZExtValue())) { - Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32); - return true; - } - } - - return false; + return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, + /* Imm32Only */ true, /* IsBuffer */ true); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, + SDValue &Offset) const { + // Match the (soffset + offset) pair as a 32-bit register base and + // a non-zero immediate offset. + return N.getValueType() == MVT::i32 && + SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr, + &Offset, /* Imm32Only */ false, + /* IsBuffer */ true); } bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -294,6 +294,7 @@ ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const; ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const; + ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const; void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4875,6 +4875,26 @@ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { + // Match the (soffset + offset) pair as a 32-bit register base and + // a non-zero immediate offset. + Register SOffset; + unsigned Offset; + std::tie(SOffset, Offset) = + AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg()); + if (!SOffset || MRI->getType(SOffset) != LLT::scalar(32) || Offset == 0) + return None; + + Optional EncodedOffset = + AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true); + if (!EncodedOffset) + return None; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1806,6 +1806,7 @@ unsigned ImmOffset; const LLT S32 = LLT::scalar(32); + // TODO: Use AMDGPU::getBaseWithConstantOffset() instead. std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), OrigOffset); diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -859,6 +859,7 @@ def SMRDSgprImm : ComplexPattern; def SMRDBufferImm : ComplexPattern; def SMRDBufferImm32 : ComplexPattern; +def SMRDBufferSgprImm : ComplexPattern; multiclass SMRD_Pattern { @@ -914,9 +915,18 @@ // 3. Offset loaded in an 32bit SGPR def : GCNPat < - (SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy), - (vt (!cast(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_cpol $cachepolicy))) + (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy), + (vt (!cast(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy))) >; + + // 4. Offset as an 32-bit SGPR + immediate + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset), + timm:$cachepolicy), + (vt (!cast(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset, + (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX9Plus]; + } } // Global and constant loads can be selected to either MUBUF or SMRD diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll --- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll @@ -88,8 +88,32 @@ ret void } +; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset +; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0 +; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1 +; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2 +; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3 +; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4 +; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 +; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 77, +; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0 +; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1 +; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2 +; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3 +; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4 +; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3 +; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77, +define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %base, i32 inreg %i, i32 addrspace(1)* inreg %out) { + %off = add nuw nsw i32 77, %i + %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0) + store i32 %v, i32 addrspace(1)* %out, align 4 + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1 +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) nounwind readnone willreturn + ; Function Attrs: nounwind readnone speculatable declare i32 @llvm.amdgcn.reloc.constant(metadata) #3