Index: lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
===================================================================
--- lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -160,6 +160,74 @@
   return &ValMappingsSGPR64OnlyVGPR32[2];
 }
 
+const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
+  /* 256-bit load */    {0, 256, SGPRRegBank},
+  /* 512-bit load */    {0, 512, SGPRRegBank},
+  /* 8 32-bit loads */  {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+                        {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+                        {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+                        {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+  /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
+                        {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
+                        {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
+                        {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
+                        {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
+                        {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
+                        {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
+                        {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
+  /* 4 64-bit loads */  {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+                        {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+  /* 8 64-bit loads */  {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
+                        {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
+                        {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
+                        {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
+
+  /* FIXME: The generic register bank select does not support complex
+   * breakdowns where the number of vector elements does not equal the
+   * number of breakdowns.
+   */
+  /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+  /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
+                        {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
+  /* 256-bit load */    {&LoadSGPROnlyBreakDown[0], 1},
+  /* 512-bit load */    {&LoadSGPROnlyBreakDown[1], 1},
+  /* <8 x i32> load */  {&LoadSGPROnlyBreakDown[2], 8},
+  /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
+  /* <4 x i64> load */  {&LoadSGPROnlyBreakDown[26], 4},
+  /* <8 x i64> load */  {&LoadSGPROnlyBreakDown[30], 8}
+};
+
+const RegisterBankInfo::ValueMapping *
+getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
+  unsigned Size = SizeTy.getSizeInBits();
+  if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
+    return getValueMapping(BankID, Size);
+
+  assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
+
+  // Default to using the non-split ValueMappings; we will use these if the
+  // register bank is SGPR or if we don't know how to handle the vector type.
+  unsigned Idx = Size == 256 ? 0 : 1;
+
+  // We need to split this load if it has a vgpr pointer.
+  if (BankID == AMDGPU::VGPRRegBankID) {
+    if (SizeTy == LLT::vector(8, 32))
+      Idx = 2;
+    else if (SizeTy == LLT::vector(16, 32))
+      Idx = 3;
+    else if (SizeTy == LLT::vector(4, 64))
+      Idx = 4;
+    else if (SizeTy == LLT::vector(8, 64))
+      Idx = 5;
+  }
+
+  return &ValMappingsLoadSGPROnly[Idx];
+}
+
 } // End AMDGPU namespace.
 } // End llvm namespace.
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -220,6 +220,7 @@
   getActionDefinitionsBuilder({G_LOAD, G_STORE})
     .legalIf([=, &ST](const LegalityQuery &Query) {
         const LLT &Ty0 = Query.Types[0];
+        const LLT &PtrTy = Query.Types[1];
 
         // TODO: Decompose private loads into 4-byte components.
         // TODO: Illegal flat loads on SI
@@ -235,7 +236,14 @@
         case 256:
         case 512:
-          // TODO: constant loads
+          // TODO: Possibly support loads of i256 and i512. This will require
+          // adding i256 and i512 types to MVT in order to be able to use
+          // TableGen.
+          // TODO: Add support for other vector types; this will require
+          // defining more value mappings for the new types.
+          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
+                                    Ty0.getScalarType().getSizeInBits() == 64);
+
         default:
           return false;
         }
Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -100,6 +100,12 @@
   // SGPR are illegal.
   assert(CurBank == nullptr && "shouldn't see already assigned bank");
 
+  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
+  // VGPR.
+  // FIXME: Is there a better way to do this?
+  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
+    return 10; // This is expensive.
+
   assert(ValMapping.NumBreakDowns == 2 &&
          ValMapping.BreakDown[0].Length == 32 &&
          ValMapping.BreakDown[0].StartIdx == 0 &&
@@ -125,6 +131,15 @@
   return getRegBank(AMDGPU::VGPRRegBankID);
 }
 
+static bool isInstrUniform(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  return AMDGPUInstrInfo::isUniformMMO(MMO);
+}
+
+
 RegisterBankInfo::InstructionMappings
 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
     const MachineInstr &MI) const {
@@ -179,29 +194,33 @@
   }
   case TargetOpcode::G_LOAD: {
     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+    LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
+    // FIXME: Is it possible to have physreg outputs at this point? If not, we
+    // don't need to check LoadTy.isValid().
+    if (!LoadTy.isValid())
+      LoadTy = LLT::scalar(Size);
 
     // FIXME: Should we be hard coding the size for these mappings?
-    const InstructionMapping &SSMapping = getInstructionMapping(
-        1, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
-                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
-        2); // Num Operands
-    AltMappings.push_back(&SSMapping);
+    if (isInstrUniform(MI)) {
+      const InstructionMapping &SSMapping = getInstructionMapping(
+          1, 1, getOperandsMapping(
+                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+          2); // Num Operands
+      AltMappings.push_back(&SSMapping);
+    }
 
     const InstructionMapping &VVMapping = getInstructionMapping(
         2, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                  {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
                    AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
         2); // Num Operands
     AltMappings.push_back(&VVMapping);
 
-    // FIXME: Should this be the pointer-size (64-bits) or the size of the
-    // register that will hold the bufffer resourc (128-bits).
-    const InstructionMapping &VSMapping = getInstructionMapping(
-        3, 1, getOperandsMapping(
-                  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
-                   AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
-        2); // Num Operands
-    AltMappings.push_back(&VSMapping);
+    // It may be possible to have a vgpr = load sgpr mapping here, because
+    // the mubuf instructions support this kind of load, but probably only on
+    // gfx7 and older. However, the addressing mode matching in the instruction
+    // selector should be able to do a better job of detecting and selecting
+    // these kinds of loads from the vgpr = load vgpr mapping.
 
     return AltMappings;
 
@@ -384,6 +403,74 @@
     MI.eraseFromParent();
     return;
   }
+  case AMDGPU::G_LOAD: {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned LoadSize = MRI.getType(DstReg).getSizeInBits();
+    const unsigned MaxNonSmrdLoadSize = 128;
+    // 128-bit loads are supported for all instruction types.
+    if (LoadSize <= MaxNonSmrdLoadSize)
+      break;
+
+    SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
+    SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+
+    // If the pointer is an SGPR, we have nothing to do.
+    if (SrcRegs.empty())
+      break;
+
+    assert(LoadSize % MaxNonSmrdLoadSize == 0);
+
+    // RegBankSelect only emits scalar types, so we need to reset the pointer
+    // operand to a pointer type.
+    unsigned BasePtrReg = SrcRegs[0];
+    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+    MRI.setType(BasePtrReg, PtrTy);
+
+    MachineIRBuilder B(MI);
+
+    const RegisterBank *PtrBank = getRegBank(BasePtrReg, MRI, *TRI);
+    unsigned SplitRegSize = LoadSize / DefRegs.size();
+    unsigned NumLoads = LoadSize / MaxNonSmrdLoadSize;
+    const LLT LoadTy = LLT::vector(MaxNonSmrdLoadSize / SplitRegSize,
+                                   SplitRegSize);
+    for (unsigned DefIdx = 0, LoadIdx = 0, Offset = 0;
+         LoadIdx != NumLoads; ++LoadIdx) {
+      unsigned PtrReg = MRI.createGenericVirtualRegister(PtrTy);
+      unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+      unsigned LoadReg = MRI.createGenericVirtualRegister(LoadTy);
+
+      B.buildConstant(OffsetReg, Offset);
+      MRI.setRegBank(OffsetReg, *PtrBank);
+
+      B.buildGEP(PtrReg, BasePtrReg, OffsetReg);
+      MRI.setRegBank(PtrReg, *PtrBank);
+
+      unsigned OffsetBytes = Offset / 8;
+      const MachineMemOperand &MMO = **MI.memoperands_begin();
+      unsigned Alignment = MinAlign(MMO.getAlignment(), OffsetBytes);
+      MachineMemOperand *SplitMMO = B.getMF().getMachineMemOperand(
+          MMO.getPointerInfo().getWithOffset(OffsetBytes), MMO.getFlags(),
+          MaxNonSmrdLoadSize / 8, Alignment, MMO.getAAInfo(), MMO.getRanges(),
+          MMO.getSyncScopeID(), MMO.getOrdering(), MMO.getFailureOrdering());
+
+      B.buildLoad(LoadReg, PtrReg, *SplitMMO);
+      MRI.setRegBank(LoadReg, getRegBank(AMDGPU::VGPRRegBankID));
+
+      for (unsigned i = 0, e = MaxNonSmrdLoadSize / SplitRegSize;
+           i != e; ++i, ++DefIdx) {
+        unsigned IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+        B.buildConstant(IdxReg, i);
+        MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
+        B.buildExtractVectorElement(DefRegs[DefIdx], LoadReg, IdxReg);
+      }
+
+      Offset += MaxNonSmrdLoadSize;
+    }
+
+    MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
+    MI.eraseFromParent();
+    return;
+  }
   default:
     break;
   }
@@ -391,14 +478,6 @@
   return applyDefaultMapping(OpdMapper);
 }
 
-static bool isInstrUniform(const MachineInstr &MI) {
-  if (!MI.hasOneMemOperand())
-    return false;
-
-  const MachineMemOperand *MMO = *MI.memoperands_begin();
-  return AMDGPUInstrInfo::isUniformMMO(MMO);
-}
-
 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
   const MachineFunction &MF = *MI.getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -486,6 +565,11 @@
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+  LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
+  // FIXME: Is it possible to have physreg outputs at this point? If not, we
+  // don't need to check LoadTy.isValid().
+  if (!LoadTy.isValid())
+    LoadTy = LLT::scalar(Size);
   unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
 
   const ValueMapping *ValMapping;
@@ -496,7 +580,7 @@
     ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
   } else {
-    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+    ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
     // FIXME: What would happen if we used SGPRRegBankID here?
     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
   }
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -7,6 +7,7 @@
 --- |
   define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void }
   define amdgpu_kernel void @smrd_buffer_imm(i32 addrspace(7)* %const1) { ret void }
+  define amdgpu_kernel void @smrd_wide() { ret void }
 ...
 ---
@@ -299,3 +300,32 @@
 ...
 ---
+
+name: smrd_wide
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    %0:sgpr(p4) = COPY $sgpr0_sgpr1
+    %1:sgpr(p1) = COPY $sgpr2_sgpr3
+
+    ; CHECK: [[CONSTANT_PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[GLOBAL_PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr2_sgpr3
+    ; CHECK: s_load_dwordx8 [[CONSTANT_PTR]]
+    %2:sgpr(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %2
+
+    ; CHECK: s_load_dwordx16 [[CONSTANT_PTR]]
+    %3:sgpr(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3
+
+    ; CHECK: s_load_dwordx8 [[GLOBAL_PTR]]
+    %4:sgpr(<8 x s32>) = G_LOAD %1 :: (load 32, addrspace 1)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %4
+
+    ; CHECK: s_load_dwordx16 [[GLOBAL_PTR]]
+    %5:sgpr(<16 x s32>) = G_LOAD %1 :: (load 64, addrspace 1)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %5
+...
Index: test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir
+++ test/CodeGen/AMDGPU/GlobalISel/legalize-load.mir
@@ -129,3 +129,33 @@
     $vgpr0_vgpr1_vgpr2 = COPY %1
 ...
+
+---
+name: test_load_constant_v8i32
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: test_load_constant_v8i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4)
+    ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[LOAD]](<8 x s32>)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
+...
+
+---
+name: test_load_constant_v16i32
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: test_load_constant_v16i32
+    ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load 64, addrspace 4)
+    ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[LOAD]](<16 x s32>)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+    $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
+...
Index: test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -0,0 +1,504 @@
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+
+# REQUIRES: global-isel
+
+--- |
+  define amdgpu_kernel void @load_global_v8i32_non_uniform(<8 x i32> addrspace(1)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %global.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tmp0
+    %tmp2 = load <8 x i32>, <8 x i32> addrspace(1)* %global.not.uniform.v8i32
+    ret void
+  }
+  define amdgpu_kernel void @load_global_v4i64_non_uniform(<4 x i64> addrspace(1)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %global.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tmp0
+    %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %global.not.uniform.v4i64
+    ret void
+  }
+  define amdgpu_kernel void @load_global_v16i32_non_uniform(<16 x i32> addrspace(1)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %global.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tmp0
+    %tmp2 = load <16 x i32>, <16 x i32> addrspace(1)* %global.not.uniform.v16i32
+    ret void
+  }
+  define amdgpu_kernel void @load_global_v8i64_non_uniform(<8 x i64> addrspace(1)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %global.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(1)* %in, i32 %tmp0
+    %tmp2 = load <8 x i64>, <8 x i64> addrspace(1)* %global.not.uniform.v8i64
+    ret void
+  }
+  define amdgpu_kernel void @load_global_v8i32_uniform() {ret void}
+  define amdgpu_kernel void @load_global_v4i64_uniform() {ret void}
+  define amdgpu_kernel void @load_global_v16i32_uniform() {ret void}
+  define amdgpu_kernel void @load_global_v8i64_uniform() {ret void}
+  define amdgpu_kernel void @load_constant_v8i32_non_uniform(<8 x i32> addrspace(4)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %constant.not.uniform.v8i32 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp0
+    %tmp2 = load <8 x i32>, <8 x i32> addrspace(4)* %constant.not.uniform.v8i32
+    ret void
+  }
+  define amdgpu_kernel void @load_constant_v4i64_non_uniform(<4 x i64> addrspace(4)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %constant.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(4)* %in, i32 %tmp0
+    %tmp2 = load <4 x i64>, <4 x i64> addrspace(4)* %constant.not.uniform.v4i64
+    ret void
+  }
+  define amdgpu_kernel void @load_constant_v16i32_non_uniform(<16 x i32> addrspace(4)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %constant.not.uniform.v16i32 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp0
+    %tmp2 = load <16 x i32>, <16 x i32> addrspace(4)* %constant.not.uniform.v16i32
+    ret void
+  }
+  define amdgpu_kernel void @load_constant_v8i64_non_uniform(<8 x i64> addrspace(4)* %in) {
+    %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
+    %constant.not.uniform.v8i64 = getelementptr <8 x i64>, <8 x i64> addrspace(4)* %in, i32 %tmp0
+    %tmp2 = load <8 x i64>, <8 x i64> addrspace(4)* %constant.not.uniform.v8i64
+    ret void
+  }
+  define amdgpu_kernel void @load_constant_v8i32_uniform() {ret void}
+  define amdgpu_kernel void @load_constant_v4i64_uniform() {ret void}
+  define amdgpu_kernel void @load_constant_v16i32_uniform() {ret void}
+  define amdgpu_kernel void @load_constant_v8i64_uniform() {ret void}
+  declare i32 @llvm.amdgcn.workitem.id.x() #0
+  attributes #0 = { nounwind readnone }
+...
+
+---
+name: load_global_v8i32_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v8i32_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p1) = COPY [[PTR]](p1)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP0]](p1) :: (load 16 from %ir.global.not.uniform.v8i32, align 32, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP128]](p1) :: (load 16 from %ir.global.not.uniform.v8i32 + 16, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX3]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v8i32)
+...
+
+---
+name: load_global_v4i64_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v4i64_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p1) = COPY [[PTR]](p1)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP0]](p1) :: (load 16 from %ir.global.not.uniform.v4i64, align 32, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP128]](p1) :: (load 16 from %ir.global.not.uniform.v4i64 + 16, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX1]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v4i64)
+...
+
+---
+name: load_global_v16i32_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v16i32_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p1) = COPY [[PTR]](p1)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP0]](p1) :: (load 16 from %ir.global.not.uniform.v16i32, align 64, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP128]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 16, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET256:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
+    ; CHECK: [[GEP256:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET256]](s32)
+    ; CHECK: [[LOAD256:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP256]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 32, align 32, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET384:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 384
+    ; CHECK: [[GEP384:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET384]](s32)
+    ; CHECK: [[LOAD384:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP384]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 48, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX3]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v16i32)
+...
+
+---
+name: load_global_v8i64_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v8i64_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p1) = COPY [[PTR]](p1)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP0]](p1) :: (load 16 from %ir.global.not.uniform.v8i64, align 64, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP128]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 16, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET256:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
+    ; CHECK: [[GEP256:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET256]](s32)
+    ; CHECK: [[LOAD256:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP256]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 32, align 32, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET384:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 384
+    ; CHECK: [[GEP384:%[0-9]+]]:vgpr(p1) = G_GEP [[VPTR]], [[OFFSET384]](s32)
+    ; CHECK: [[LOAD384:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP384]](p1) :: (load 16 from %ir.global.not.uniform.v8i64 + 48, addrspace 1)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<2 x s64>), [[IDX1]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v8i64)
+...
+
+---
+name: load_global_v8i32_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v8i32_uniform
+    ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 1)
+...
+
+---
+name: load_global_v4i64_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v4i64_uniform
+    ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 1)
+...
+
+---
+name: load_global_v16i32_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v16i32_uniform
+    ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 1)
+...
+
+---
+name: load_global_v8i64_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_v8i64_uniform
+    ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 1)
+...
+
+---
+name: load_constant_v8i32_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v8i32_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p4) = COPY [[PTR]](p4)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP0]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32, align 32, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP128]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32 + 16, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX3]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v8i32)
+...
+
+---
+name: load_constant_v4i64_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v4i64_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p4) = COPY [[PTR]](p4)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP0]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64, align 32, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP128]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64 + 16, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX1]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v4i64)
+...
+
+---
+name: load_constant_v16i32_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v16i32_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p4) = COPY [[PTR]](p4)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP0]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32, align 64, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP128]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 16, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET256:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
+    ; CHECK: [[GEP256:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET256]](s32)
+    ; CHECK: [[LOAD256:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP256]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 32, align 32, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT8:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT9:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT10:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT11:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<4 x s32>), [[IDX3]]
+    ; CHECK: [[OFFSET384:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 384
+    ; CHECK: [[GEP384:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET384]](s32)
+    ; CHECK: [[LOAD384:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP384]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 48, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT12:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT13:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX1]]
+    ; CHECK: [[IDX2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 2
+    ; CHECK: [[OUT14:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX2]]
+    ; CHECK: [[IDX3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 3
+    ; CHECK: [[OUT15:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<4 x s32>), [[IDX3]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s32), [[OUT1]](s32), [[OUT2]](s32), [[OUT3]](s32), [[OUT4]](s32), [[OUT5]](s32), [[OUT6]](s32), [[OUT7]](s32), [[OUT8]](s32), [[OUT9]](s32), [[OUT10]](s32), [[OUT11]](s32), [[OUT12]](s32), [[OUT13]](s32), [[OUT14]](s32), [[OUT15]](s32)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v16i32)
+...
+
+---
+name: load_constant_v8i64_non_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v8i64_non_uniform
+    ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[VPTR:%[0-9]+]]:vgpr(p4) = COPY [[PTR]](p4)
+    ; CHECK: [[OFFSET0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[GEP0:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET0]](s32)
+    ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP0]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64, align 64, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT0:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT1:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD0]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET128:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 128
+    ; CHECK: [[GEP128:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET128]](s32)
+    ; CHECK: [[LOAD128:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP128]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 16, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT2:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT3:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD128]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET256:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256
+    ; CHECK: [[GEP256:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET256]](s32)
+    ; CHECK: [[LOAD256:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP256]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 32, align 32, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT4:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT5:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD256]](<2 x s64>), [[IDX1]]
+    ; CHECK: [[OFFSET384:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 384
+    ; CHECK: [[GEP384:%[0-9]+]]:vgpr(p4) = G_GEP [[VPTR]], [[OFFSET384]](s32)
+    ; CHECK: [[LOAD384:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP384]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 48, addrspace 4)
+    ; CHECK: [[IDX0:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; CHECK: [[OUT6:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<2 x s64>), [[IDX0]]
+    ; CHECK: [[IDX1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; CHECK: [[OUT7:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[LOAD384]](<2 x s64>), [[IDX1]]
+    ; CHECK: G_BUILD_VECTOR [[OUT0]](s64), [[OUT1]](s64), [[OUT2]](s64), [[OUT3]](s64), [[OUT4]](s64), [[OUT5]](s64), [[OUT6]](s64), [[OUT7]](s64)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v8i64)
+...
+
+---
+name: load_constant_v8i32_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v8i32_uniform
+    ; CHECK: (<8 x s32>) = G_LOAD %0(p4) :: (load 32, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4)
+...
+
+---
+name: load_constant_v4i64_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v4i64_uniform
+    ; CHECK: (<4 x s64>) = G_LOAD %0(p4) :: (load 32, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4)
+...
+
+---
+name: load_constant_v16i32_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v16i32_uniform
+    ; CHECK: (<16 x s32>) = G_LOAD %0(p4) :: (load 64, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4)
+...
+
+---
+name: load_constant_v8i64_uniform
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_v8i64_uniform
+    ; CHECK: (<8 x s64>) = G_LOAD %0(p4) :: (load 64, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4)
+...