Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -12,6 +12,10 @@
 include "AMDGPU.td"
 
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p4 : PtrValueType<i64, 4>;
+
 def sd_vsrc0 : ComplexPattern;
 def gi_vsrc0 :
     GIComplexOperandMatcher,
@@ -34,6 +38,18 @@
     GIComplexOperandMatcher,
     GIComplexPatternEquiv;
 
+def gi_smrd_imm :
+    GIComplexOperandMatcher<s64, "selectSmrdImm">,
+    GIComplexPatternEquiv<SMRDImm>;
+
+def gi_smrd_imm32 :
+    GIComplexOperandMatcher<s64, "selectSmrdImm32">,
+    GIComplexPatternEquiv<SMRDImm32>;
+
+def gi_smrd_sgpr :
+    GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
+    GIComplexPatternEquiv<SMRDSgpr>;
+
 class GISelSop2Pat <
   SDPatternOperator node,
   Instruction inst,
@@ -134,3 +150,11 @@
 def : GISelVop3Pat2ModsPat ;
 defm : GISelVop2IntrPat ;
 def : GISelVop3Pat2ModsPat ;
+
+// Since GlobalISel is more flexible than SelectionDAG, I think we can get
+// away with adding patterns for integer types and not legalizing all
+// loads and stores to vector types. This should help simplify the load/store
+// legalization.
+foreach Ty = [i64, p0, p1, p4] in {
+  defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
+}
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -57,6 +57,7 @@
     GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
   };
 
+  bool isInstrUniform(const MachineInstr &MI) const;
   /// tblgen-erated 'select' implementation.
   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
 
@@ -89,6 +90,13 @@
   InstructionSelector::ComplexRendererFns
   selectVOP3Mods(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectSmrdImm(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectSmrdImm32(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectSmrdSgpr(MachineOperand &Root) const;
+
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
   const AMDGPURegisterBankInfo &RBI;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -422,7 +422,7 @@
   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
 }
 
-static bool isInstrUniform(const MachineInstr &MI) {
+bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
   if (!MI.hasOneMemOperand())
     return false;
 
@@ -444,52 +444,6 @@
   return I && I->getMetadata("amdgpu.uniform");
 }
 
-static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
-
-  if (LoadSize == 32)
-    return BaseOpcode;
-
-  switch (BaseOpcode) {
-  case AMDGPU::S_LOAD_DWORD_IMM:
-    switch (LoadSize) {
-    case 64:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
-    case 128:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
-    case 256:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
-    case 512:
-      return AMDGPU::S_LOAD_DWORDX16_IMM;
-    }
-    break;
-  case AMDGPU::S_LOAD_DWORD_IMM_ci:
-    switch (LoadSize) {
-    case 64:
-      return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
-    case 128:
-      return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
-    case 256:
-      return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
-    case 512:
-      return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
-    }
-    break;
-  case AMDGPU::S_LOAD_DWORD_SGPR:
-    switch (LoadSize) {
-    case 64:
-      return AMDGPU::S_LOAD_DWORDX2_SGPR;
-    case 128:
-      return AMDGPU::S_LOAD_DWORDX4_SGPR;
-    case 256:
-      return AMDGPU::S_LOAD_DWORDX8_SGPR;
-    case 512:
-      return AMDGPU::S_LOAD_DWORDX16_SGPR;
-    }
-    break;
-  }
-  llvm_unreachable("Invalid base smrd opcode or size");
-}
-
 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
   for (const GEPInfo &GEPInfo : AddrInfo) {
     if (!GEPInfo.VgprParts.empty())
@@ -498,81 +452,6 @@
   return false;
 }
 
-bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
-                                           ArrayRef<GEPInfo> AddrInfo) const {
-
-  if (!I.hasOneMemOperand())
-    return false;
-
-  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
-      (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
-    return false;
-
-  if (!isInstrUniform(I))
-    return false;
-
-  if (hasVgprParts(AddrInfo))
-    return false;
-
-  MachineBasicBlock *BB = I.getParent();
-  MachineFunction *MF = BB->getParent();
-  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned DstReg = I.getOperand(0).getReg();
-  const DebugLoc &DL = I.getDebugLoc();
-  unsigned Opcode;
-  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
-
-  if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
-
-    const GEPInfo &GEPInfo = AddrInfo[0];
-
-    unsigned PtrReg = GEPInfo.SgprParts[0];
-    int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm);
-    if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
-      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
-
-      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
-                               .addReg(PtrReg)
-                               .addImm(EncodedImm)
-                               .addImm(0); // glc
-      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
-    }
-
-    if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
-        isUInt<32>(EncodedImm)) {
-      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
-      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
-                               .addReg(PtrReg)
-                               .addImm(EncodedImm)
-                               .addImm(0); // glc
-      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
-    }
-
-    if (isUInt<32>(GEPInfo.Imm)) {
-      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
-      unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addImm(GEPInfo.Imm);
-
-      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
-                               .addReg(PtrReg)
-                               .addReg(OffsetReg)
-                               .addImm(0); // glc
-      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
-    }
-  }
-
-  unsigned PtrReg = I.getOperand(1).getReg();
-  Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
-  MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
-                           .addReg(PtrReg)
-                           .addImm(0)
-                           .addImm(0); // glc
-  return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
-}
-
-
 bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
   MachineFunction *MF = BB->getParent();
@@ -587,11 +466,6 @@
   getAddrModeInfo(I, MRI, AddrInfo);
 
-  if (selectSMRD(I, AddrInfo)) {
-    I.eraseFromParent();
-    return true;
-  }
-
   switch (LoadSize) {
   default:
     llvm_unreachable("Load size not supported\n");
@@ -644,6 +518,8 @@
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
     return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
   case TargetOpcode::G_LOAD:
+    if (selectImpl(I, CoverageInfo))
+      return true;
    return selectG_LOAD(I);
   case TargetOpcode::G_STORE:
     return selectG_STORE(I);
@@ -694,3 +570,82 @@
     [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
   }};
 }
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+  MachineRegisterInfo &MRI =
+      Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  SmallVector<GEPInfo, 4> AddrInfo;
+  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+    return None;
+
+  const GEPInfo &GEPInfo = AddrInfo[0];
+
+  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+    return None;
+
+  unsigned PtrReg = GEPInfo.SgprParts[0];
+  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+  return {{
+    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
+  MachineRegisterInfo &MRI =
+      Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  SmallVector<GEPInfo, 4> AddrInfo;
+  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+
+  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+    return None;
+
+  const GEPInfo &GEPInfo = AddrInfo[0];
+  unsigned PtrReg = GEPInfo.SgprParts[0];
+  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
+  if (!isUInt<32>(EncodedImm))
+    return None;
+
+  return {{
+    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
+  MachineInstr *MI = Root.getParent();
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  SmallVector<GEPInfo, 4> AddrInfo;
+  getAddrModeInfo(*MI, MRI, AddrInfo);
+
+  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
+  // then we can select all ptr + 32-bit offsets not just immediate offsets.
+  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+    return None;
+
+  const GEPInfo &GEPInfo = AddrInfo[0];
+  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+    return None;
+
+  // If we make it this far we have a load with a 32-bit immediate offset.
+  // It is OK to select this using an sgpr offset, because we have already
+  // failed trying to select this load into one of the _IMM variants since
+  // the _IMM Patterns are considered before the _SGPR patterns.
+  unsigned PtrReg = GEPInfo.SgprParts[0];
+  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+          .addImm(GEPInfo.Imm);
+  return {{
+    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
+  }};
+}
Index: llvm/trunk/lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SMInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SMInstructions.td
@@ -682,7 +682,22 @@
 // Scalar Memory Patterns
 //===----------------------------------------------------------------------===//
 
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>;
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> {
+  let GISelPredicateCode = [{
+    if (!MI.hasOneMemOperand())
+      return false;
+    if (!isInstrUniform(MI))
+      return false;
+
+    // FIXME: We should probably be caching this.
+    SmallVector<GEPInfo, 4> AddrInfo;
+    getAddrModeInfo(MI, MRI, AddrInfo);
+
+    if (hasVgprParts(AddrInfo))
+      return false;
+    return true;
+  }];
+}
 
 def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
 def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
@@ -710,6 +725,12 @@
     (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
     (vt (!cast(Instr#"_SGPR") $sbase, $offset, 0))
   >;
+
+  // 4. No offset
+  def : GCNPat <
+    (vt (smrd_load (i64 SReg_64:$sbase))),
+    (vt (!cast(Instr#"_IMM") i64:$sbase, 0, 0))
+  >;
 }
 
 multiclass SMLoad_Pattern {
Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -25,19 +25,19 @@
 # VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0
 
 # Immediate overflow for SI
-# SI: [[K1024:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
+# SI: [[K1024:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1024
 # SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0
 # VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0
 
 # Max immediate offset for VI
-# SI: [[K1048572:%[0-9]+]]:sreg_32 = S_MOV_B32 1048572
+# SI: [[K1048572:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048572
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143
 # VI: S_LOAD_DWORD_IMM [[PTR]], 1048572
 #
 # Immediate overflow for VI
-# SIVI: [[K1048576:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
+# SIVI: [[K1048576:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
 # SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
 
@@ -69,7 +69,7 @@
 # GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
 
 # Max 32-bit byte offset
-# SIVI: [[K4294967292:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
+# SIVI: [[K4294967292:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4294967292
 # SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
 
@@ -87,6 +87,14 @@
 # SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0
 
+# Pointer loads
+# GCN: [[AS0:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0
+# GCN: $sgpr0_sgpr1 = COPY [[AS0]]
+# GCN: [[AS1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0
+# GCN: $sgpr0_sgpr1 = COPY [[AS1]]
+# GCN: [[AS4:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0
+# GCN: $sgpr0_sgpr1 = COPY [[AS4]]
+
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
 
     %1:sgpr(s64) = G_CONSTANT i64 4
     %2:sgpr(p4) = G_GEP %0, %1
-    %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
+    %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %3
 
     %4:sgpr(s64) = G_CONSTANT i64 1020
     %5:sgpr(p4) = G_GEP %0, %4
-    %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
+    %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %6
 
     %7:sgpr(s64) = G_CONSTANT i64 1024
     %8:sgpr(p4) = G_GEP %0, %7
-    %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
+    %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %9
 
     %10:sgpr(s64) = G_CONSTANT i64 1048572
     %11:sgpr(p4) = G_GEP %0, %10
-    %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
+    %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %12
 
     %13:sgpr(s64) = G_CONSTANT i64 1048576
     %14:sgpr(p4) = G_GEP %0, %13
-    %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
+    %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %15
 
     %16:sgpr(s64) = G_CONSTANT i64 17179869180
     %17:sgpr(p4) = G_GEP %0, %16
-    %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
+    %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %18
 
     %19:sgpr(s64) = G_CONSTANT i64 17179869184
     %20:sgpr(p4) = G_GEP %0, %19
-    %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
+    %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %21
 
     %22:sgpr(s64) = G_CONSTANT i64 4294967292
     %23:sgpr(p4) = G_GEP %0, %22
-    %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
+    %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %24
 
     %25:sgpr(s64) = G_CONSTANT i64 4294967296
     %26:sgpr(p4) = G_GEP %0, %25
-    %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
+    %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0, addrspace 4)
     $sgpr0 = COPY %27
 
+    %28:sgpr(p0) = G_LOAD %0 :: (load 8 from %ir.const0, addrspace 4)
+    $sgpr0_sgpr1 = COPY %28
+
+    %29:sgpr(p1) = G_LOAD %0 :: (load 8 from %ir.const0, addrspace 4)
+    $sgpr0_sgpr1 = COPY %29
+
+    %30:sgpr(p4) = G_LOAD %0 :: (load 8 from %ir.const0, addrspace 4)
+    $sgpr0_sgpr1 = COPY %30
+
 ...
 ---