Index: lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- lib/Target/AMDGPU/AMDGPUGISel.td +++ lib/Target/AMDGPU/AMDGPUGISel.td @@ -35,6 +35,18 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_smrd_imm : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_smrd_imm32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_smrd_sgpr : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + class GISelSop2Pat < SDPatternOperator node, Instruction inst, @@ -136,3 +148,9 @@ def : GISelVop3Pat2ModsPat ; defm : GISelVop2IntrPat ; def : GISelVop3Pat2ModsPat ; + +// Since GlobalISel is more flexible than SelectionDAG, I think we can get +// away with adding patterns for integer types and not legalizing all +// loads and stores to vector types. This should help simplify the load/store +// legalization. +defm : SMRD_Pattern <"S_LOAD_DWORDX2", i64>; Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -58,6 +58,7 @@ GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + bool isInstrUniform(const MachineInstr &MI) const; /// tblgen-erated 'select' implementation. 
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; @@ -90,6 +91,13 @@ InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdImm32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSmrdSgpr(MachineOperand &Root) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -423,7 +423,7 @@ getAddrModeInfo(*PtrMI, MRI, AddrInfo); } -static bool isInstrUniform(const MachineInstr &MI) { +bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (!MI.hasOneMemOperand()) return false; @@ -445,52 +445,6 @@ return I && I->getMetadata("amdgpu.uniform"); } -static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) { - - if (LoadSize == 32) - return BaseOpcode; - - switch (BaseOpcode) { - case AMDGPU::S_LOAD_DWORD_IMM: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM; - } - break; - case AMDGPU::S_LOAD_DWORD_IMM_ci: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_IMM_ci; - case 128: - return AMDGPU::S_LOAD_DWORDX4_IMM_ci; - case 256: - return AMDGPU::S_LOAD_DWORDX8_IMM_ci; - case 512: - return AMDGPU::S_LOAD_DWORDX16_IMM_ci; - } - break; - case AMDGPU::S_LOAD_DWORD_SGPR: - switch (LoadSize) { - case 64: - return AMDGPU::S_LOAD_DWORDX2_SGPR; - case 128: - return AMDGPU::S_LOAD_DWORDX4_SGPR; - case 256: - return AMDGPU::S_LOAD_DWORDX8_SGPR; - case 512: - return 
AMDGPU::S_LOAD_DWORDX16_SGPR; - } - break; - } - llvm_unreachable("Invalid base smrd opcode or size"); -} - bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef AddrInfo) const { for (const GEPInfo &GEPInfo : AddrInfo) { if (!GEPInfo.VgprParts.empty()) @@ -499,81 +453,6 @@ return false; } -bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, - ArrayRef AddrInfo) const { - - if (!I.hasOneMemOperand()) - return false; - - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS && - (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) - return false; - - if (!isInstrUniform(I)) - return false; - - if (hasVgprParts(AddrInfo)) - return false; - - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - const GCNSubtarget &Subtarget = MF->getSubtarget(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = I.getOperand(0).getReg(); - const DebugLoc &DL = I.getDebugLoc(); - unsigned Opcode; - unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); - - if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { - - const GEPInfo &GEPInfo = AddrInfo[0]; - - unsigned PtrReg = GEPInfo.SgprParts[0]; - int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); - if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } - - if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && - isUInt<32>(EncodedImm)) { - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(EncodedImm) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } - - if (isUInt<32>(GEPInfo.Imm)) { - 
Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); - unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addReg(OffsetReg) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); - } - } - - unsigned PtrReg = I.getOperand(1).getReg(); - Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); - MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) - .addReg(PtrReg) - .addImm(0) - .addImm(0); // glc - return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); -} - - bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); @@ -588,11 +467,6 @@ getAddrModeInfo(I, MRI, AddrInfo); - if (selectSMRD(I, AddrInfo)) { - I.eraseFromParent(); - return true; - } - switch (LoadSize) { default: llvm_unreachable("Load size not supported\n"); @@ -645,6 +519,8 @@ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); case TargetOpcode::G_LOAD: + if (selectImpl(I, CoverageInfo)) + return true; return selectG_LOAD(I); case TargetOpcode::G_STORE: return selectG_STORE(I); @@ -695,3 +571,82 @@ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods }}; } + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + + if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm)) + return None; + + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = 
AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*Root.getParent(), MRI, AddrInfo); + + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + unsigned PtrReg = GEPInfo.SgprParts[0]; + int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm); + if (!isUInt<32>(EncodedImm)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { + MachineInstr *MI = Root.getParent(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + SmallVector AddrInfo; + getAddrModeInfo(*MI, MRI, AddrInfo); + + // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, + // then we can select all ptr + 32-bit offsets not just immediate offsets. + if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + return None; + + const GEPInfo &GEPInfo = AddrInfo[0]; + if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm)) + return None; + + // If we make it this far we have a load with a 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. 
+ unsigned PtrReg = GEPInfo.SgprParts[0]; + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } + }}; +} Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -28,6 +28,9 @@ public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // End llvm namespace. #endif Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -126,6 +127,9 @@ .legalIf([=, &ST](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; + if (Query.Opcode == G_LOAD && Ty0.isPointer()) + return false; + // TODO: Decompose private loads into 4-byte components. // TODO: Illegal flat loads on SI switch (Ty0.getSizeInBits()) { @@ -144,7 +148,12 @@ default: return false; } - }); + }) + .customIf([](const LegalityQuery &Query) { + // TODO: Tablegen can't handle loads/stores of pointer types so we + // need to custom lower these. 
+ return Query.Opcode == G_LOAD && Query.Types[0].isPointer(); + }); @@ -212,3 +221,31 @@ computeTables(); verify(*ST.getInstrInfo()); } + +bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MIRBuilder.setInstr(MI); + MachineFunction &MF = *MI.getParent()->getParent(); + switch (MI.getOpcode()) { + default: + // No idea what to do. + return false; + case TargetOpcode::G_LOAD: { + unsigned DstReg = MI.getOperand(0).getReg(); + LLT LoadTy = MRI.getType(DstReg); + if (!LoadTy.isPointer()) + return false; + unsigned LoadSize = LoadTy.getSizeInBits(); + assert(LoadSize % 32 == 0); + + LLT IntTy = LLT::scalar(LoadSize); + unsigned IntReg = MRI.createGenericVirtualRegister(IntTy); + MIRBuilder.buildInstr(TargetOpcode::G_LOAD, IntReg, + MI.getOperand(1).getReg())->cloneMemRefs(MF, MI); + MIRBuilder.buildCast(DstReg, IntReg); + MI.eraseFromParent(); + return true; + } + } +} Index: lib/Target/AMDGPU/SMInstructions.td =================================================================== --- lib/Target/AMDGPU/SMInstructions.td +++ lib/Target/AMDGPU/SMInstructions.td @@ -684,7 +684,22 @@ // Scalar Memory Patterns //===----------------------------------------------------------------------===// -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>; +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> { + let GISelPredicateCode = [{ + if (!MI.hasOneMemOperand()) + return false; + if (!isInstrUniform(MI)) + return false; + + // FIXME: We should probably be caching this + SmallVector AddrInfo; + getAddrModeInfo(MI, MRI, AddrInfo); + + if (hasVgprParts(AddrInfo)) + return false; + return true; + }]; +} def SMRDImm : ComplexPattern; def SMRDImm32 : ComplexPattern; @@ -712,6 +727,12 @@ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_SGPR") $sbase, $offset, 0)) >; + + // 4. 
No offset + def : GCNPat < + (vt (smrd_load (i64 SReg_64:$sbase))), + (vt (!cast(Instr#"_IMM") i64:$sbase, 0, 0)) + >; } multiclass SMLoad_Pattern { Index: test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir +++ test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir @@ -41,7 +41,8 @@ ; CHECK-LABEL: name: test_load_global_p1 ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1) + ; CHECK: [[LOAD_INT:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_INTTOPTR [[LOAD_INT]] ; CHECK: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p1) = G_LOAD %0 :: (load 8, addrspace 1) @@ -57,7 +58,8 @@ ; CHECK-LABEL: name: test_load_global_p4 ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1) + ; CHECK: [[LOAD_INT:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:_(p4) = G_INTTOPTR [[LOAD_INT]] ; CHECK: $vgpr0_vgpr1 = COPY [[LOAD]](p4) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, addrspace 1) @@ -74,7 +76,8 @@ ; CHECK-LABEL: name: test_load_global_p3 ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p1) :: (load 4, addrspace 1) + ; CHECK: [[LOAD_INT:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 4, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:_(p3) = G_INTTOPTR [[LOAD_INT]] ; CHECK: $vgpr0 = COPY [[LOAD]](p3) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p3) = G_LOAD %0 :: (load 4, addrspace 1)