Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -161,5 +161,77 @@ return &ValMappingsSGPR64OnlyVGPR32[2]; } +const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] { + /* 256-bit load */ {0, 256, SGPRRegBank}, + /* 512-bit load */ {0, 512, SGPRRegBank}, + /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank}, + {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank}, + {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank}, + {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank}, + /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank}, + {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank}, + + /* FIXME: The generic register bank select does not support complex + * break downs where the number of vector elements does not equal the + * number of breakdowns. + * FIXME: register bank select now tries to handle complex break downs, + * but it emits an illegal instruction: + * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128) + */ + /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] { + /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1}, + /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1}, + /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8}, + /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16}, + /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4}, + /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8} +}; + +const RegisterBankInfo::ValueMapping * +getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) { + unsigned Size = SizeTy.getSizeInBits(); + if (Size < 256 || BankID == AMDGPU::SGPRRegBankID) + return getValueMapping(BankID, Size); + + assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID); + + // Default to using the non-split ValueMappings, we will use these if + // the register bank is SGPR or if we don't know how to handle the vector + // type. + unsigned Idx = Size == 256 ? 0 : 1; + + // We need to split this load if it has a vgpr pointer. + if (BankID == AMDGPU::VGPRRegBankID) { + if (SizeTy == LLT::vector(8, 32)) + Idx = 2; + else if (SizeTy == LLT::vector(16, 32)) + Idx = 3; + else if (SizeTy == LLT::vector(4, 64)) + Idx = 4; + else if (SizeTy == LLT::vector(8, 64)) + Idx = 5; + } + + return &ValMappingsLoadSGPROnly[Idx]; +} + + } // End AMDGPU namespace. } // End llvm namespace. 
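A note on the table above: each ValueMapping entry must point at a run of PartialMappings that tiles the full load width with no gaps, which is what the hard-coded start indices encode. Below is a minimal standalone sketch of that invariant; the struct field names only mirror RegisterBankInfo::PartialMapping/ValueMapping, and the bank ids and the check itself are illustrative, not part of the patch.

#include <cassert>
#include <cstdio>

// Simplified stand-ins for the RegisterBankInfo mapping types.
struct PartialMapping { unsigned StartIdx, Length, Bank; };
struct ValueMapping { const PartialMapping *BreakDown; unsigned NumBreakDowns; };

enum Bank { SGPR, VGPR };

// Mirrors the <8 x i32> row of LoadSGPROnlyBreakDown: eight 32-bit VGPR pieces.
static const PartialMapping V8S32[] = {
    {0, 32, VGPR},   {32, 32, VGPR},  {64, 32, VGPR},  {96, 32, VGPR},
    {128, 32, VGPR}, {160, 32, VGPR}, {192, 32, VGPR}, {224, 32, VGPR}};

static const ValueMapping V8S32Mapping = {V8S32, 8};

int main() {
  // The pieces must tile the 256-bit value contiguously.
  unsigned NextStart = 0;
  for (unsigned I = 0; I != V8S32Mapping.NumBreakDowns; ++I) {
    const PartialMapping &PM = V8S32Mapping.BreakDown[I];
    assert(PM.StartIdx == NextStart && "pieces must be contiguous");
    NextStart += PM.Length;
  }
  assert(NextStart == 256 && "pieces must cover the whole 256-bit load");
  std::printf("covered %u bits in %u pieces\n", NextStart,
              V8S32Mapping.NumBreakDowns);
  return 0;
}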
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -517,7 +517,14 @@
     case 256:
     case 512:
-      // TODO: constant loads
+      // TODO: Possibly support loads of i256 and i512. This will require
+      // adding i256 and i512 types to MVT in order to be able to use
+      // TableGen.
+      // TODO: Add support for other vector types; this will require
+      // defining more value mappings for the new types.
+      return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+                                Ty0.getScalarType().getSizeInBits() == 64);
+
     default:
       return false;
     }
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -44,6 +44,9 @@
   void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
                                     unsigned OpIdx) const;
+  bool applyMappingWideLoad(MachineInstr &MI,
+                            const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+                            MachineRegisterInfo &MRI) const;
 
   /// See RegisterBankInfo::applyMapping.
   void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -37,22 +37,23 @@
 namespace {
 
 // Observer to apply a register bank to new registers created by LegalizerHelper.
-class ApplySALUMapping final : public GISelChangeObserver {
+class ApplyRegBankMapping final : public GISelChangeObserver {
 private:
   MachineRegisterInfo &MRI;
+  const RegisterBank *NewBank;
   SmallVector<MachineInstr *, 4> NewInsts;
 
 public:
-  ApplySALUMapping(MachineRegisterInfo &MRI_)
-    : MRI(MRI_) {}
+  ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
+    : MRI(MRI_), NewBank(RB) {}
 
-  ~ApplySALUMapping() {
+  ~ApplyRegBankMapping() {
     for (MachineInstr *MI : NewInsts)
-      applySALUBank(*MI);
+      applyBank(*MI);
   }
 
   /// Set any registers that don't have a set register class or bank to SALU.
-  void applySALUBank(MachineInstr &MI) {
+  void applyBank(MachineInstr &MI) {
     for (MachineOperand &Op : MI.operands()) {
       if (!Op.isReg())
         continue;
@@ -61,10 +62,13 @@
       if (MRI.getRegClassOrRegBank(Reg))
         continue;
 
+      const RegisterBank *RB = NewBank;
       // FIXME: This might not be enough to detect when SCC should be used.
-      const RegisterBank &RB = MRI.getType(Reg) == LLT::scalar(1) ?
-        AMDGPU::SCCRegBank : AMDGPU::SGPRRegBank;
-      MRI.setRegBank(Reg, RB);
+      if (MRI.getType(Reg) == LLT::scalar(1))
+        RB = (NewBank == &AMDGPU::SGPRRegBank ?
+              &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
+
+      MRI.setRegBank(Reg, *RB);
     }
   }
 
@@ -80,7 +84,6 @@
 };
 
 }
-
 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
     : AMDGPUGenRegisterBankInfo(),
       TRI(static_cast<const SIRegisterInfo *>(&TRI)) {
@@ -128,6 +131,12 @@
 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
   const ValueMapping &ValMapping,
   const RegisterBank *CurBank) const {
+  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
+  // VGPR.
+  // FIXME: Is there a better way to do this?
+  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
+    return 10; // This is expensive.
+
   assert(ValMapping.NumBreakDowns == 2 &&
          ValMapping.BreakDown[0].Length == 32 &&
          ValMapping.BreakDown[0].StartIdx == 0 &&
@@ -302,6 +311,14 @@
   }
 }
 
+static bool isInstrUniform(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
 RegisterBankInfo::InstructionMappings
 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     const MachineInstr &MI) const {
@@ -356,29 +373,29 @@
   }
   case TargetOpcode::G_LOAD: {
     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
     // FIXME: Should we be hard coding the size for these mappings?
-    const InstructionMapping &SSMapping = getInstructionMapping(
-        1, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
-                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
-        2); // Num Operands
-    AltMappings.push_back(&SSMapping);
+    if (isInstrUniform(MI)) {
+      const InstructionMapping &SSMapping = getInstructionMapping(
+          1, 1, getOperandsMapping(
+                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+          2); // Num Operands
+      AltMappings.push_back(&SSMapping);
+    }
 
     const InstructionMapping &VVMapping = getInstructionMapping(
         2, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                  {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
                    AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
         2); // Num Operands
     AltMappings.push_back(&VVMapping);
 
-    // FIXME: Should this be the pointer-size (64-bits) or the size of the
-    // register that will hold the bufffer resourc (128-bits).
-    const InstructionMapping &VSMapping = getInstructionMapping(
-        3, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
-                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
-        2); // Num Operands
-    AltMappings.push_back(&VSMapping);
+    // It may be possible to have a vgpr = load sgpr mapping here, because
+    // the mubuf instructions support this kind of load, but probably only for
+    // gfx7 and older. However, the addressing mode matching in the instruction
+    // selector should be able to do a better job of detecting and selecting
+    // these kinds of loads from the vgpr = load vgpr mapping.
 
     return AltMappings;
@@ -874,6 +891,91 @@
   MI.getOperand(OpIdx).setReg(SGPR);
 }
 
+// When regbankselect repairs registers, it will insert a repair instruction
+// which defines the repaired register. Then it calls applyMapping and expects
+// that the targets will either delete or rewrite the instruction that
+// originally wrote to the repaired registers. Because of this, we end up in a
+// situation where we have 2 instructions defining the same registers.
+static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
+                                     Register Reg,
+                                     const MachineInstr &MI) {
+  // Is there some way we can assert that there are exactly 2 def instructions?
+  for (MachineInstr &Other : MRI.def_instructions(Reg)) {
+    if (&Other != &MI)
+      return &Other;
+  }
+
+  return nullptr;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+                                              MachineRegisterInfo &MRI) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  const LLT LoadTy = MRI.getType(DstReg);
+  unsigned LoadSize = LoadTy.getSizeInBits();
+  const unsigned MaxNonSmrdLoadSize = 128;
+  // 128-bit loads are supported for all instruction types.
+ if (LoadSize <= MaxNonSmrdLoadSize) + return false; + + SmallVector DefRegs(OpdMapper.getVRegs(0)); + SmallVector SrcRegs(OpdMapper.getVRegs(1)); + + // If the pointer is an SGPR, we have nothing to do. + if (SrcRegs.empty()) + return false; + + assert(LoadSize % MaxNonSmrdLoadSize == 0); + + // We want to get the repair instruction now, because it will help us + // determine which instruction the legalizer inserts that will also + // write to DstReg. + MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); + + // RegBankSelect only emits scalar types, so we need to reset the pointer + // operand to a pointer type. + Register BasePtrReg = SrcRegs[0]; + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + MRI.setType(BasePtrReg, PtrTy); + + MachineIRBuilder B(MI); + + unsigned SplitElts = + MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); + const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + LegalizerHelper Helper(B.getMF(), Observer, B); + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + + // At this point, the legalizer has split the original load into smaller + // loads. At the end of lowering, it inserts an instruction (LegalizedInst) + // that combines the outputs of the lower loads and writes it to DstReg. + // The register bank selector has also added the RepairInst which writes to + // DstReg as well. + + MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); + + // Replace the output of the LegalizedInst with a temporary register, since + // RepairInst already defines DstReg. + Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); + LegalizedInst->getOperand(0).setReg(TmpReg); + B.setInsertPt(*RepairInst->getParent(), RepairInst); + + for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { + Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + B.buildConstant(IdxReg, DefIdx); + MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); + B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); + } + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + return true; +} + // For cases where only a single copy is inserted for matching register banks. // Replace the register in the instruction operand static void substituteSimpleCopyRegs( @@ -1008,7 +1110,7 @@ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 
MachineFunction *MF = MI.getParent()->getParent(); MachineIRBuilder B(MI); - ApplySALUMapping ApplySALU(MRI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); @@ -1028,7 +1130,7 @@ MachineFunction *MF = MI.getParent()->getParent(); MachineIRBuilder B(MI); - ApplySALUMapping ApplySALU(MRI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); @@ -1212,6 +1314,11 @@ } break; } + case AMDGPU::G_LOAD: { + if (applyMappingWideLoad(MI, OpdMapper, MRI)) + return; + break; + } default: break; } @@ -1219,14 +1326,6 @@ return applyDefaultMapping(OpdMapper); } -static bool isInstrUniform(const MachineInstr &MI) { - if (!MI.hasOneMemOperand()) - return false; - - const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); -} - bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1322,6 +1421,7 @@ const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector OpdsMapping(MI.getNumOperands()); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); const ValueMapping *ValMapping; @@ -1332,7 +1432,7 @@ ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { - ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -4,6 +4,7 @@ --- | define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void } + define amdgpu_kernel void @smrd_wide() { ret void } ... --- @@ -155,3 +156,32 @@ ... --- + +name: smrd_wide +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr2_vgpr3 + %0:sgpr(p4) = COPY $sgpr0_sgpr1 + %1:sgpr(p1) = COPY $sgpr2_sgpr3 + + ; CHECK: [[CONSTANT_PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[GLOBAL_PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3 + ; CHECK: s_load_dwordx8 [[CONSTANT_PTR]] + %2:sgpr(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %2 + + ; CHECK: s_load_dwordx16 [[CONSTANT_PTR]] + %3:sgpr(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3 + + ; CHECK: s_load_dwordx8 [[GLOBAL_PTR]] + %4:sgpr(<8 x s32>) = G_LOAD %1 :: (load 32, addrspace 1) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %4 + + ; CHECK s_load_dwordx16 [[GLOBAL_PTR]] + %5:sgpr(<16 x s32>) = G_LOAD %1 :: (load 64, addrspace 1) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %5 +... 
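For context on applyMappingWideLoad above and the regbankselect-load.mir checks below: when a 256-bit or 512-bit load ends up with a VGPR pointer, the mapping hands the instruction to LegalizerHelper::fewerElementsVector with a 128-bit-per-piece vector type, which is what produces the pieces at byte offsets 0, 16, 32 and 48 seen in the G_GEP constants in the tests. A small standalone sketch of that split arithmetic follows; only the MaxNonSmrdLoadSize value comes from the patch, and planWideLoadSplit is a hypothetical helper used purely for illustration.

#include <cassert>
#include <cstdio>

struct SplitPlan {
  unsigned NumParts;    // how many narrow loads are emitted
  unsigned EltsPerPart; // vector elements per narrow load
  unsigned ByteStride;  // byte offset between consecutive parts
};

static SplitPlan planWideLoadSplit(unsigned NumElts, unsigned EltBits) {
  const unsigned MaxNonSmrdLoadSize = 128; // widest non-SMRD load, in bits
  unsigned LoadSize = NumElts * EltBits;
  assert(LoadSize > MaxNonSmrdLoadSize && LoadSize % MaxNonSmrdLoadSize == 0);
  return {LoadSize / MaxNonSmrdLoadSize, MaxNonSmrdLoadSize / EltBits,
          MaxNonSmrdLoadSize / 8};
}

int main() {
  // <8 x s32>: two <4 x s32> loads at byte offsets 0 and 16, matching the
  // G_GEP constants checked in regbankselect-load.mir.
  SplitPlan P = planWideLoadSplit(8, 32);
  for (unsigned I = 0; I != P.NumParts; ++I)
    std::printf("part %u: %u elements at byte offset %u\n", I, P.EltsPerPart,
                I * P.ByteStride);
  return 0;
}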
Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir @@ -390,3 +390,33 @@ %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, addrspace 1, align 16) $vgpr0_vgpr1_vgpr2 = COPY %1 ... + +--- +name: test_load_constant_v8i32 +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: test_load_global_v8i32 + ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4) + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[LOAD]](<8 x s32>) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1 +... + +--- +name: test_load_constant_v16i32 +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: test_load_global_v16i32 + ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load 64, addrspace 4) + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[LOAD]](<16 x s32>) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4) + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1 +... Index: llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ llvm/trunk/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -0,0 +1,488 @@ +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +# REQUIRES: global-isel + +--- | + define amdgpu_kernel void @load_global_v8i32_non_uniform(<8 x i32> addrspace(1)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %global.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tmp0 + %tmp2 = load <8 x i32>, <8 x i32> addrspace(1)* %global.not.uniform.v8i32 + ret void + } + define amdgpu_kernel void @load_global_v4i64_non_uniform(<4 x i64> addrspace(1)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %global.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tmp0 + %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %global.not.uniform.v4i64 + ret void + } + define amdgpu_kernel void @load_global_v16i32_non_uniform(<16 x i32> addrspace(1)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %global.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tmp0 + %tmp2 = load <16 x i32>, <16 x i32> addrspace(1)* %global.not.uniform.v16i32 + ret void + } + define amdgpu_kernel void @load_global_v8i64_non_uniform(<8 x i64> addrspace(1)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %global.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(1)* %in, i32 %tmp0 + %tmp2 = load <8 x i64>, <8 x i64> addrspace(1)* %global.not.uniform.v8i64 + ret void + } + define amdgpu_kernel void @load_global_v8i32_uniform() {ret void} + define amdgpu_kernel void @load_global_v4i64_uniform() {ret void} + define amdgpu_kernel 
void @load_global_v16i32_uniform() {ret void} + define amdgpu_kernel void @load_global_v8i64_uniform() {ret void} + define amdgpu_kernel void @load_constant_v8i32_non_uniform(<8 x i32> addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp0 + %tmp2 = load <8 x i32>, <8 x i32> addrspace(4)* %constant.not.uniform.v8i32 + ret void + } + define amdgpu_kernel void @load_constant_v4i64_non_uniform(<4 x i64> addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(4)* %in, i32 %tmp0 + %tmp2 = load <4 x i64>, <4 x i64> addrspace(4)* %constant.not.uniform.v4i64 + ret void + } + define amdgpu_kernel void @load_constant_v16i32_non_uniform(<16 x i32> addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp0 + %tmp2 = load <16 x i32>, <16 x i32> addrspace(4)* %constant.not.uniform.v16i32 + ret void + } + define amdgpu_kernel void @load_constant_v8i64_non_uniform(<8 x i64> addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(4)* %in, i32 %tmp0 + %tmp2 = load <8 x i64>, <8 x i64> addrspace(4)* %constant.not.uniform.v8i64 + ret void + } + define amdgpu_kernel void @load_constant_v8i32_uniform() {ret void} + define amdgpu_kernel void @load_constant_v4i64_uniform() {ret void} + define amdgpu_kernel void @load_constant_v16i32_uniform() {ret void} + define amdgpu_kernel void @load_constant_v8i64_uniform() {ret void} + declare i32 @llvm.amdgcn.workitem.id.x() #0 + attributes #0 = { nounwind readnone } +... 
+ +--- +name : load_global_v8i32_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v8i32_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v8i32, align 32, addrspace 1) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v8i32 + 16, align 32, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX7]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v8i32) +... 
+ +--- +name : load_global_v4i64_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v4i64, align 32, addrspace 1) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v4i64 + 16, align 32, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX3]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64) + + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v4i64) +... + +--- +name : load_global_v16i32_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v16i32_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v16i32, align 64, addrspace 1) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 16, align 64, addrspace 1) + ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET32]](s64) + ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 32, align 64, addrspace 1) + ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET48]](s64) + ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 48, align 64, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x 
s32>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX7]] + ; CHECK: [[IDX8:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX8]] + ; CHECK: [[IDX9:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 9 + ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX9]] + ; CHECK: [[IDX10:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX10]] + ; CHECK: [[IDX11:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 11 + ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX11]] + ; CHECK: [[IDX12:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX12]] + ; CHECK: [[IDX13:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 13 + ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX13]] + ; CHECK: [[IDX14:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 14 + ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX14]] + ; CHECK: [[IDX15:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 15 + ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX15]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v16i32) +... 
+ +name : load_global_v8i64_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v8i64_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v8i64, align 64, addrspace 1) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 16, align 64, addrspace 1) + ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET32]](s64) + ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP32]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 32, align 64, addrspace 1) + ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p1) = G_GEP [[PTR]], [[OFFSET48]](s64) + ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP48]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 48, align 64, addrspace 1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>), [[LOAD32]](<2 x s64>), [[LOAD48]](<2 x s64>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX7]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v8i64) +... + +--- +name : load_global_v8i32_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v8i32_uniform + ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (load 32, addrspace 1) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 1) +... + +--- +name : load_global_v4i64_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v4i64_uniform + ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (load 32, addrspace 1) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 1) +... 
+ +--- +name : load_global_v16i32_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v16i32_uniform + ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (load 64, addrspace 1) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 1) +... + +--- +name : load_global_v8i64_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_global_v8i64_uniform + ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (load 64, addrspace 1) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 1) +... + +--- +name : load_constant_v8i32_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v8i32_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32 + 16, align 32, addrspace 4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s32>), [[IDX7]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v8i32) +... 
+ +--- +name : load_constant_v4i64_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v4i64_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64 + 16, align 32, addrspace 4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<4 x s64>), [[IDX3]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v4i64) +... + +--- +name : load_constant_v16i32_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v16i32_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 16, align 64, addrspace 4) + ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET32]](s64) + ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 32, align 64, addrspace 4) + ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET48]](s64) + ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 48, align 64, addrspace 4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; 
CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX7]] + ; CHECK: [[IDX8:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX8]] + ; CHECK: [[IDX9:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 + ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX9]] + ; CHECK: [[IDX10:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX10]] + ; CHECK: [[IDX11:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 11 + ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX11]] + ; CHECK: [[IDX12:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX12]] + ; CHECK: [[IDX13:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 13 + ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX13]] + ; CHECK: [[IDX14:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 14 + ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX14]] + ; CHECK: [[IDX15:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 15 + ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<16 x s32>), [[IDX15]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v16i32) +... 
+ +--- +name : load_constant_v8i64_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v8i64_non_uniform + ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) + ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET16]](s64) + ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 16, align 64, addrspace 4) + ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET32]](s64) + ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 32, align 64, addrspace 4) + ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_GEP [[PTR]], [[OFFSET48]](s64) + ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 48, align 64, addrspace 4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>), [[LOAD32]](<2 x s64>), [[LOAD48]](<2 x s64>) + ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX0]] + ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX1]] + ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX2]] + ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX3]] + ; CHECK: [[IDX4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX4]] + ; CHECK: [[IDX5:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX5]] + ; CHECK: [[IDX6:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX6]] + ; CHECK: [[IDX7:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD]](<8 x s64>), [[IDX7]] + ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v8i64) +... + +--- +name : load_constant_v8i32_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v8i32_uniform + ; CHECK (<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) +... + +--- +name : load_constant_v4i64_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v4i64_uniform + ; CHECK (<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4) +... 
+ +--- +name : load_constant_v16i32_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v16i32_uniform + ; CHECK (<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4) +... + +--- +name : load_constant_v8i64_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v8i64_uniform + ; CHECK (<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4) +...
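The non-uniform cases above all end in a chain of G_CONSTANT / G_EXTRACT_VECTOR_ELT pairs feeding a G_BUILD_VECTOR: the extract chain comes from the DefRegs loop at the end of applyMappingWideLoad, while the G_BUILD_VECTOR is the repair instruction that regbankselect itself inserted for the split destination. A minimal standalone model of that rebuild, with plain arrays standing in for the virtual registers (all names here are illustrative only):

#include <array>
#include <cstdio>

int main() {
  // Stand-in for the <8 x s32> value produced by G_CONCAT_VECTORS of the
  // two <4 x s32> partial loads.
  std::array<unsigned, 8> WideLoad = {10, 11, 12, 13, 14, 15, 16, 17};
  // Stand-ins for the repaired 32-bit registers that the pre-existing
  // G_BUILD_VECTOR repair instruction consumes.
  std::array<unsigned, 8> DefRegs = {};

  for (unsigned DefIdx = 0; DefIdx != DefRegs.size(); ++DefIdx) {
    unsigned IdxReg = DefIdx;           // G_CONSTANT i32 DefIdx
    DefRegs[DefIdx] = WideLoad[IdxReg]; // G_EXTRACT_VECTOR_ELT
  }

  for (unsigned V : DefRegs)
    std::printf("%u ", V);
  std::printf("\n");
  return 0;
}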