Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -76,6 +76,7 @@
   applyMappingImage(MachineInstr &MI,
                     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                     MachineRegisterInfo &MRI, int RSrcIdx) const;
+  bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
 
   void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1240,6 +1240,227 @@
   return true;
 }
 
+static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
+                                        Register Reg) {
+  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (!Def)
+    return Reg;
+
+  // TODO: Guard against this being an implicit def
+  return Def->getOperand(0).getReg();
+}
+
+// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
+// the three offsets (voffset, soffset and instoffset)
+static unsigned setBufferOffsets(MachineIRBuilder &B,
+                                 const AMDGPURegisterBankInfo &RBI,
+                                 Register CombinedOffset,
+                                 Register &VOffsetReg,
+                                 Register &SOffsetReg,
+                                 int64_t &InstOffsetVal,
+                                 unsigned Align) {
+  const LLT S32 = LLT::scalar(32);
+  MachineRegisterInfo *MRI = B.getMRI();
+
+  if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
+    uint32_t SOffset, ImmOffset;
+    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset,
+                                 &RBI.Subtarget, Align)) {
+      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+      InstOffsetVal = ImmOffset;
+
+      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+      return SOffset + ImmOffset;
+    }
+  }
+
+  Register Base;
+  unsigned Offset;
+  MachineInstr *Unused;
+
+  std::tie(Base, Offset, Unused)
+    = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
+
+  uint32_t SOffset, ImmOffset;
+  if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                             &RBI.Subtarget, Align)) {
+    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+      VOffsetReg = Base;
+      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+      InstOffsetVal = ImmOffset;
+      return 0; // XXX - Why is this 0?
+    }
+
+    // If we have SGPR base, we can use it for soffset.
+    if (SOffset == 0) {
+      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+      SOffsetReg = Base;
+      InstOffsetVal = ImmOffset;
+      return 0; // XXX - Why is this 0?
+    }
+  }
+
+  // Handle the variable sgpr + vgpr case.
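+  // If the offset is produced by an explicit G_ADD whose operands already
+  // live in different banks, those operands are reused directly as the VGPR
+  // and SGPR offsets instead of inserting extra copies.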
+  if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
+    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
+    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
+
+    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
+    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
+
+    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
+      VOffsetReg = Src0;
+      SOffsetReg = Src1;
+      return 0;
+    }
+
+    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
+      VOffsetReg = Src1;
+      SOffsetReg = Src0;
+      return 0;
+    }
+  }
+
+  // Ensure we have a VGPR for the combined offset. This could be an issue if
+  // we have an SGPR offset and a VGPR resource.
+  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+    VOffsetReg = CombinedOffset;
+  } else {
+    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
+    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+  }
+
+  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
+  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+  return 0;
+}
+
+static LLT divideLLT(LLT Ty, int Factor) {
+  if (Ty.isVector())
+    return LLT::vector(Ty.getNumElements() / Factor, Ty.getElementType());
+  return LLT::scalar(Ty.getSizeInBits() / Factor);
+}
+
+bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
+    const OperandsMapper &OpdMapper) const {
+  MachineInstr &MI = OpdMapper.getMI();
+  MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+  const LLT S32 = LLT::scalar(32);
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+
+  const RegisterBank *RSrcBank =
+    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+  const RegisterBank *OffsetBank =
+    OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+  if (RSrcBank == &AMDGPU::SGPRRegBank &&
+      OffsetBank == &AMDGPU::SGPRRegBank)
+    return true; // Legal mapping
+
+  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
+  // here but don't have an MMO.
+
+  unsigned LoadSize = Ty.getSizeInBits();
+  int NumLoads = 1;
+  if (LoadSize == 256 || LoadSize == 512) {
+    NumLoads = LoadSize / 128;
+    Ty = divideLLT(Ty, NumLoads);
+  }
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  const unsigned Align = NumLoads > 1 ? 16 * NumLoads : 4;
+
+  MachineIRBuilder B(MI);
+  MachineFunction &MF = B.getMF();
+
+  Register SOffset;
+  Register VOffset;
+  int64_t ImmOffset = 0;
+
+  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(3).getReg(),
+                                        VOffset, SOffset, ImmOffset, Align);
+
+  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
+  // can, but we need to track an MMO for that.
+  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
+  const unsigned MemAlign = 4; // FIXME: ABI type alignment?
+  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
+    MachinePointerInfo(),
+    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+    MachineMemOperand::MOInvariant,
+    MemSize, MemAlign);
+  if (MMOOffset != 0)
+    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
+
+  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
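+  // The s.buffer.load intrinsic has no index operand, so a zero VGPR is used
+  // for vindex and idxen stays clear. Wide (256/512-bit) results are loaded
+  // in 128-bit pieces, adding 16 * i to the immediate offset of each piece.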
+
+  Register RSrc = MI.getOperand(2).getReg();
+  Register VIndex = B.buildConstant(S32, 0).getReg(0);
+  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
+
+  SmallVector<Register, 4> LoadParts(NumLoads);
+
+  MachineBasicBlock::iterator MII = MI.getIterator();
+  MachineInstrSpan Span(MII, &B.getMBB());
+
+  for (int i = 0; i < NumLoads; ++i) {
+    if (NumLoads == 1) {
+      LoadParts[i] = Dst;
+    } else {
+      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
+      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
+    }
+
+    MachineMemOperand *MMO = BaseMMO;
+    if (i != 0)
+      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
+
+    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
+      .addDef(LoadParts[i])       // vdata
+      .addUse(RSrc)               // rsrc
+      .addUse(VIndex)             // vindex
+      .addUse(VOffset)            // voffset
+      .addUse(SOffset)            // soffset
+      .addImm(ImmOffset + 16 * i) // offset(imm)
+      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
+      .addImm(0)                  // idxen(imm)
+      .addMemOperand(MMO);
+  }
+
+  if (RSrcBank != &AMDGPU::SGPRRegBank) {
+    // Remove the original instruction to avoid potentially confusing the
+    // waterfall loop logic.
+    B.setInstr(*Span.begin());
+    MI.eraseFromParent();
+
+    SmallSet<Register, 4> OpsToWaterfall;
+
+    OpsToWaterfall.insert(RSrc);
+    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+                           OpsToWaterfall, MRI);
+  }
+
+  if (NumLoads != 1) {
+    if (Ty.isVector())
+      B.buildConcatVectors(Dst, LoadParts);
+    else
+      B.buildMerge(Dst, LoadParts);
+  }
+
+  // We removed the instruction earlier with a waterfall loop.
+  if (RSrcBank == &AMDGPU::SGPRRegBank)
+    MI.eraseFromParent();
+
+  return true;
+}
+
 // FIXME: Duplicated from LegalizerHelper
 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
   switch (Opc) {
@@ -2274,8 +2495,7 @@
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
     case Intrinsic::amdgcn_s_buffer_load: {
-      // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
-      executeInWaterfallLoop(MI, MRI, { 2, 3 });
+      applyMappingSBufferLoad(OpdMapper);
       return;
     }
     case Intrinsic::amdgcn_readlane: {
@@ -3246,24 +3466,20 @@
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
-    Register RSrc = MI.getOperand(2).getReg();   // SGPR
-    Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
-
-    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
-    unsigned Size3 = MRI.getType(Offset).getSizeInBits();
-
-    unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
-    unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
-    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
-    OpdsMapping[1] = nullptr; // intrinsic id
 
     // Lie and claim everything is legal, even though some need to be
     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
-    OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
-    OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
-    OpdsMapping[4] = nullptr;
+    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+    // We need to convert this to a MUBUF if either the resource or offset is
+    // VGPR.
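+    // The result bank is the union of the rsrc and offset banks: it stays
+    // SGPR only when both inputs are uniform, and becomes VGPR whenever the
+    // load will be expanded to a MUBUF instruction.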
+ unsigned RSrcBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); + unsigned OffsetBank = OpdsMapping[3]->BreakDown[0].RegBank->getID(); + unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); + + unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); break; } case Intrinsic::amdgcn_div_scale: { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -30,27 +30,12 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0 ; CHECK-LABEL: name: buffer_load_sv - ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[V_READFIRSTLANE_B32_]](s32), 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 %2:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %1, 0 @@ -70,14 +55,17 @@ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), 
%bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -88,7 +76,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY2]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -114,15 +102,17 @@ ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr4 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -133,11 +123,8 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], 
[[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[V_READFIRSTLANE_B32_4]](s32), 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK: .2: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -0,0 +1,1588 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s + +; Natural mapping +define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret i32 %val +} + +define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>) + ; CHECK: $sgpr0 = COPY [[UV]](s32) + ; CHECK: $sgpr1 = COPY [[UV1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <2 x i32> %val +} + +define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v3i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 + ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[INT]](<4 x s32>), 0 + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>) + ; CHECK: $sgpr0 = COPY [[UV]](s32) + ; CHECK: $sgpr1 = COPY [[UV1]](s32) + ; CHECK: $sgpr2 = COPY [[UV2]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <3 x i32> %val +} + +define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v8i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<8 x s32>) + ; CHECK: $sgpr0 = COPY [[UV]](s32) + ; CHECK: $sgpr1 = COPY [[UV1]](s32) + ; CHECK: $sgpr2 = COPY [[UV2]](s32) + ; CHECK: $sgpr3 = COPY [[UV3]](s32) + ; CHECK: $sgpr4 = COPY [[UV4]](s32) + ; CHECK: $sgpr5 = COPY [[UV5]](s32) + ; CHECK: $sgpr6 = COPY [[UV6]](s32) + ; CHECK: $sgpr7 = COPY [[UV7]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x i32> %val +} + +define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 
inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v16i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 + ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[INT]](<16 x s32>) + ; CHECK: $sgpr0 = COPY [[UV]](s32) + ; CHECK: $sgpr1 = COPY [[UV1]](s32) + ; CHECK: $sgpr2 = COPY [[UV2]](s32) + ; CHECK: $sgpr3 = COPY [[UV3]](s32) + ; CHECK: $sgpr4 = COPY [[UV4]](s32) + ; CHECK: $sgpr5 = COPY [[UV5]](s32) + ; CHECK: $sgpr6 = COPY [[UV6]](s32) + ; CHECK: $sgpr7 = COPY [[UV7]](s32) + ; CHECK: $sgpr8 = COPY [[UV8]](s32) + ; CHECK: $sgpr9 = COPY [[UV9]](s32) + ; CHECK: $sgpr10 = COPY [[UV10]](s32) + ; CHECK: $sgpr11 = COPY [[UV11]](s32) + ; CHECK: $sgpr12 = COPY [[UV12]](s32) + ; CHECK: $sgpr13 = COPY [[UV13]](s32) + ; CHECK: $sgpr14 = COPY [[UV14]](s32) + ; CHECK: $sgpr15 = COPY [[UV15]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 + %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <16 x i32> %val +} + +; Check cases that need to be converted to MUBUF due to the offset being a VGPR. 
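+; The resource is still uniform in these cases, so no waterfall loop is
+; needed; only the offset has to be moved into the MUBUF voffset operand.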
+define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v2f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <2 x float> %val +} + +define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v3f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[EXTRACT:%[0-9]+]]:vgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_BUFFER_LOAD]](<4 x s32>), 0 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), 
[[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <3 x float> %val +} + +define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v4f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <4 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; 
CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: $vgpr8 = COPY [[UV8]](s32) + ; CHECK: $vgpr9 = COPY [[UV9]](s32) + ; CHECK: $vgpr10 = COPY [[UV10]](s32) + ; CHECK: $vgpr11 = COPY [[UV11]](s32) + ; CHECK: $vgpr12 = COPY [[UV12]](s32) + ; CHECK: $vgpr13 = COPY [[UV13]](s32) + ; CHECK: $vgpr14 = COPY [[UV14]](s32) + ; CHECK: $vgpr15 = COPY [[UV15]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit 
$vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <16 x float> %val +} + +define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_i96_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128) + ; CHECK: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0) + store i96 %val, i96 addrspace(1)* undef + ret void +} + +; Test split of a wide scalar +define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_i256_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256) + ; CHECK: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i256 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i256 addrspace(1)* undef` + 16, align 8, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0) + store i256 %val, i256 
addrspace(1)* undef + ret void +} + +; Test split of a wide scalar +define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_i512_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512) + ; CHECK: G_STORE [[UV]](s128), [[DEF]](p1) :: (store 16 into `i512 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 16, align 8, addrspace 1) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; CHECK: G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 32, align 8, addrspace 1) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store 16 into `i512 addrspace(1)* undef` + 48, align 8, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0) + store i512 %val, i512 addrspace(1)* undef + ret void +} + +; Test split of a vector with 16-bit elements +define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v16i16_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY 
$vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<16 x i16> addrspace(1)* undef` + 16, align 32, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <16 x i16> %val, <16 x i16> addrspace(1)* undef + ret void +} + +; Test split of a vector with 16-bit elements +define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v32i16_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = 
G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) + ; CHECK: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 16, align 64, addrspace 1) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; CHECK: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 32, align 64, addrspace 1) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store 16 into `<32 x i16> addrspace(1)* undef` + 48, align 64, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <32 x i16> %val, <32 x i16> addrspace(1)* undef + ret void +} + +; Test split of a vector with 64-bit elements +define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v4i64_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>) + ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i64> addrspace(1)* undef` + 16, align 32, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <4 x i64> %val, <4 x i64> addrspace(1)* undef + ret void +} + +; Test split of a vector with 64-bit elements +define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v8i64_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: 
[[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>) + ; CHECK: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 16, align 64, addrspace 1) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; CHECK: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 32, align 64, addrspace 1) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i64> addrspace(1)* undef` + 48, align 64, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <8 x i64> %val, <8 x i64> addrspace(1)* undef + ret void +} + +; Test split of a vector with 64-bit pointer elements +define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v4p1_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: 
[[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>) + ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef`, align 32, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<4 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 32, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <4 x i8 addrspace(1)*> %val, <4 x i8 addrspace(1)*> addrspace(1)* undef + ret void +} + +; Test split of a vector with 64-bit pointer elements +define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) { + ; CHECK-LABEL: name: s_buffer_load_v8p1_vgpr_offset + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>) + ; CHECK: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<8 x i8 
addrspace(1)*> addrspace(1)* undef`, align 64, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) + ; CHECK: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 16, align 64, addrspace 1) + ; CHECK: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64) + ; CHECK: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 32, align 64, addrspace 1) + ; CHECK: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64) + ; CHECK: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store 16 into `<8 x i8 addrspace(1)*> addrspace(1)* undef` + 48, align 64, addrspace 1) + ; CHECK: S_ENDPGM 0 + %val = call <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32> %rsrc, i32 %soffset, i32 0) + store <8 x i8 addrspace(1)*> %val, <8 x i8 addrspace(1)*> addrspace(1)* undef + ret void +} + +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %soffset = add i32 %soffset.base, 4092 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable 
invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %soffset = add i32 %soffset.base, 4095 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %soffset = add i32 %soffset.base, 4096 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +; Make sure the base offset is added to each split load. +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: 
$vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %soffset.base, 4064 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +; Make sure the maximum offset isn't exceeded when splitting this +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %soffset.base, 4068 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), 
[[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: $vgpr8 = COPY [[UV8]](s32) + ; CHECK: $vgpr9 = COPY [[UV9]](s32) + ; CHECK: $vgpr10 = COPY [[UV10]](s32) + ; CHECK: $vgpr11 = COPY [[UV11]](s32) + ; CHECK: $vgpr12 = COPY [[UV12]](s32) + ; CHECK: $vgpr13 = COPY [[UV13]](s32) + ; CHECK: $vgpr14 = COPY [[UV14]](s32) + ; CHECK: $vgpr15 = COPY [[UV15]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + %soffset = add i32 %soffset.base, 4032 + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <16 x float> %val +} + +define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = 
G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: $vgpr8 = COPY [[UV8]](s32) + ; CHECK: $vgpr9 = COPY [[UV9]](s32) + ; CHECK: $vgpr10 = COPY [[UV10]](s32) + ; CHECK: $vgpr11 = COPY [[UV11]](s32) + ; CHECK: $vgpr12 = COPY [[UV12]](s32) + ; CHECK: $vgpr13 = COPY [[UV13]](s32) + ; CHECK: $vgpr14 = COPY [[UV14]](s32) + ; CHECK: $vgpr15 = COPY [[UV15]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 + %soffset = add i32 %soffset.base, 4036 + %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <16 x float> %val +} + +; Waterfall loop due to resource being VGPR +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), 
[[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %8(s32), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +; Use the offset inside the waterfall loop +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 + ; CHECK: 
[[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %18, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %soffset = add i32 %soffset.base, 4092 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +; Scalar offset exceeds MUBUF limit, keep add out of the loop +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; 
CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %19, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %soffset = add i32 %soffset.base, 4096 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret float %val +} + +; Waterfall loop, but constant offset +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; 
CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0) + ret float %val +} + +; Waterfall loop, but constant offset +define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) { + ; CHECK-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; 
CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %17, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %7(s32), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0) + ret float %val +} + +; Need a waterfall loop, but the offset is scalar. +; Make sure the base offset is added to each split load. 
+define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit 
$exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %soffset.base, 4064 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +; Need a waterfall loop, but the offset is scalar. +; Make sure the maximum offset isn't exceeded when splitting this +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %soffset.base, 4068 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) 
= G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 
%soffset.base, 4096 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, 
implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %offset.base, 5000 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: 
[[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %offset.base, 4076 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: 
[[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %soffset = add i32 %offset.base, 4080 + %val = 
call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) { + ; CHECK-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: bb.2: + ; CHECK: successors: %bb.3, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 + ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load 16, align 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], 
implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; CHECK: bb.3: + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: bb.4: + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK: $vgpr0 = COPY [[UV2]](s32) + ; CHECK: $vgpr1 = COPY [[UV3]](s32) + ; CHECK: $vgpr2 = COPY [[UV4]](s32) + ; CHECK: $vgpr3 = COPY [[UV5]](s32) + ; CHECK: $vgpr4 = COPY [[UV6]](s32) + ; CHECK: $vgpr5 = COPY [[UV7]](s32) + ; CHECK: $vgpr6 = COPY [[UV8]](s32) + ; CHECK: $vgpr7 = COPY [[UV9]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0) + ret <8 x float> %val +} + +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset = add i32 %offset.v, %offset.s + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], 
[[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset = add i32 %offset.s, %offset.v + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset.base = add i32 %offset.v, %offset.s + %offset = add i32 %offset.base, 1024 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset.base = add i32 %offset.s, %offset.v + %offset = add i32 %offset.base, 1024 + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 
0) + ret float %val +} + +; TODO: Ideally this would be reassociated to fold. +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset.base = add i32 %offset.s, 1024 + %offset = add i32 %offset.base, %offset.v + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + ret float %val +} + +define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) { + ; CHECK-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] + ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %offset.base = add i32 %offset.v, 1024 + %offset = add i32 %offset.base, %offset.s + %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + ret float %val +} + +declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) +declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) +declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, 
i32, i32 immarg) +declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg) + +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg) +declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg) + +declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg) +declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg) +declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg) + +declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg) +declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg) + +declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg) +declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg) + +declare <4 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v4p1i8(<4 x i32>, i32, i32 immarg) +declare <8 x i8 addrspace(1)*> @llvm.amdgcn.s.buffer.load.v8p1i8(<4 x i32>, i32, i32 immarg) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir @@ -0,0 +1,48 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=regbankselect -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=regbankselect -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s + +# The offset is a VGPR here, but only because the constant for some +# reason ended up in a VGPR. This shouldn't really ever happen, but +# make sure looking through copies for the add operands doesn't break +# when it does.
+ +--- +name: s_buffer_load_f32_vgpr_offset_cross_bank_copy_add_offset +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr0 + + ; FAST-LABEL: name: s_buffer_load_f32_vgpr_offset_cross_bank_copy_add_offset + ; FAST: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr0 + ; FAST: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 256 + ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; FAST: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY3]], [[COPY2]] + ; FAST: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY3]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load 4) + ; FAST: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32) + ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_cross_bank_copy_add_offset + ; GREEDY: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr0 + ; GREEDY: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 256 + ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY2]], [[C]] + ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[COPY2]], [[C1]], 256, 0, 0 :: (dereferenceable invariant load 4) + ; GREEDY: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32) + %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:_(s32) = COPY $sgpr0 + %2:vgpr(s32) = G_CONSTANT i32 256 + %3:_(s32) = G_ADD %1, %2 + %4:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), %0, %3, 0 + S_ENDPGM 0, implicit %4 + +...