diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1014,6 +1014,17 @@ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]>, AMDGPURsrcIntrinsic<0>; +// An intrinsic similar to int_amdgcn_s_buffer_load, but with an additional +// unsigned immediate instruction offset. +def int_amdgcn_s_buffer_load_imm : Intrinsic < + [llvm_any_ty], + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // byte offset + llvm_i32_ty, // unsigned imm offset (imm) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 2 = dlc) + [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>, + AMDGPURsrcIntrinsic<0>; + class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic < [], [data_ty, // vdata(VGPR) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1118,6 +1118,7 @@ OffsetIdx = 1; break; case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_s_buffer_load_imm: // If resulting type is vec3, there is no point in trimming the // load with updated offset, as the vec3 would most likely be widened to // vec4 anyway during lowering. @@ -1232,6 +1233,7 @@ case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_s_buffer_load_imm: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5212,12 +5212,34 @@ B.setInsertPt(B.getMBB(), MI); } + auto Intr = MI.getIntrinsicID(); + // FIXME: We don't really need this intermediate instruction. The intrinsic // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); MI.removeOperand(1); // Remove intrinsic ID + if (Intr == Intrinsic::amdgcn_s_buffer_load) { + // Add instr offset 0 before cachepolicy. + unsigned CachePolicy = MI.getOperand(3).getImm(); + MI.addOperand(MachineOperand::CreateImm(CachePolicy)); + MI.getOperand(3).setImm(0); + } else { + // Fall back to the old scheme with instr offset 0 by folding the immediate + // offset into soffset. + // TODO: Proper support for instr offset. + assert(Intr == Intrinsic::amdgcn_s_buffer_load_imm); + + const LLT S32 = LLT::scalar(32); + unsigned InstrOffsetUint = MI.getOperand(3).getImm(); + if (InstrOffsetUint > 0) { + auto InstrOffset = B.buildConstant(S32, InstrOffsetUint).getReg(0); + auto NewSOffset = B.buildAdd(S32, MI.getOperand(2).getReg(), InstrOffset); + MI.getOperand(2).setReg(NewSOffset.getReg(0)); + MI.getOperand(3).setImm(0); + } + } + // FIXME: When intrinsic definition is fixed, this should have an MMO already. + // TODO: Should this use datalayout alignment?
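+  // At this point both intrinsics have been rewritten into the common
+  // G_AMDGPU_S_BUFFER_LOAD form that the MIR tests below expect, e.g.
+  // (register names illustrative):
+  //   %dst:_(s32) = G_AMDGPU_S_BUFFER_LOAD %rsrc(<4 x s32>), %soffset(s32), 0, 0
+  // where the two trailing immediates are the instruction offset (always 0 at
+  // this point) and the cachepolicy.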
const unsigned MemSize = (Size + 7) / 8; @@ -5697,6 +5719,7 @@ return true; } case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_s_buffer_load_imm: return legalizeSBufferLoad(Helper, MI); case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -375,7 +375,8 @@ const MachineInstr &MI, const MachineRegisterInfo &MRI) const { switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_s_buffer_load: { + case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_s_buffer_load_imm: { static const OpRegBankEntry<2> Table[4] = { // Perfectly legal. { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -73,7 +73,8 @@ SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG, bool WithChain) const; SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, - SDValue CachePolicy, SelectionDAG &DAG) const; + SDValue ImmOffset, SDValue CachePolicy, + SelectionDAG &DAG) const; SDValue lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; @@ -244,11 +245,12 @@ bool shouldExpandVectorDynExt(SDNode *N) const; private: - // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the - // three offsets (voffset, soffset and instoffset) into the SDValue[3] array - // pointed to by Offsets. - void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, - SDValue *Offsets, Align Alignment = Align(4)) const; + // Analyze a combined offset from an amdgcn_buffer_ intrinsic, together with an + // immediate instruction offset, and store the three offsets (voffset, soffset + // and instoffset) into the SDValue[3] array pointed to by Offsets. + void setBufferOffsets(SDValue CombinedOffset, unsigned InstImmOffset, + SelectionDAG &DAG, SDValue *Offsets, + Align Alignment = Align(4)) const; // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6676,7 +6676,8 @@ } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue CachePolicy, + SDValue Offset, SDValue ImmOffset, + SDValue CachePolicy, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -6691,11 +6692,32 @@ VT.getStoreSize(), Alignment); if (!Offset->isDivergent()) { + + SDValue Ops[4]; + Ops[0] = Rsrc; + + // In the unlikely event the immediate offset overflows 20 bits, we add the + // overflow to soffset. + // The encoding has 21 bits, but using a negative offset is unsafe, so + // we only make use of 20 bits and treat the offset as unsigned.
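+  // For example, an immediate offset of 0x100004 is split into 0x100000 (added
+  // to soffset) and 0x4 (kept in the instruction offset field); see the
+  // s_buffer_load_index_imm_overflow_0x100004 test.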
+ unsigned ImmOffsetUint = cast<ConstantSDNode>(ImmOffset)->getZExtValue(); + unsigned MaxOffset = (1 << 20) - 1; + if (ImmOffsetUint > MaxOffset) { + unsigned High = ImmOffsetUint & ~MaxOffset; + unsigned Low = ImmOffsetUint & MaxOffset; + + SDValue Overflow = DAG.getConstant(High, DL, MVT::i32); + SDValue CombinedOffset = + DAG.getNode(ISD::ADD, DL, MVT::i32, Offset, Overflow); + SDValue ValidImmOffset = DAG.getTargetConstant(Low, DL, MVT::i32); + + Ops[1] = CombinedOffset; + Ops[2] = ValidImmOffset; + } else { + Ops[1] = Offset; + Ops[2] = ImmOffset; + } + Ops[3] = CachePolicy; // Widen vec3 load to vec4. if (VT.isVector() && VT.getVectorNumElements() == 3) { @@ -6741,7 +6763,8 @@ // Use the alignment to ensure that the required offsets will fit into the // immediate offsets. - setBufferOffsets(Offset, DAG, &Ops[3], + setBufferOffsets(Offset, (cast<ConstantSDNode>(ImmOffset))->getZExtValue(), + DAG, &Ops[3], NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); @@ -6943,12 +6966,22 @@ case Intrinsic::amdgcn_wavefrontsize: return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); - case Intrinsic::amdgcn_s_buffer_load: { - unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - if (CPol & ~AMDGPU::CPol::ALL) + case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_s_buffer_load_imm: { + SDValue ImmOffset, CPol; + if (IntrinsicID == Intrinsic::amdgcn_s_buffer_load_imm) { + ImmOffset = Op.getOperand(3); + CPol = Op.getOperand(4); + } else { + ImmOffset = DAG.getTargetConstant(0, DL, MVT::i32); + CPol = Op.getOperand(3); + } + + if (cast<ConstantSDNode>(CPol)->getZExtValue() & ~AMDGPU::CPol::ALL) return Op; - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - DAG); + + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), ImmOffset, + CPol, DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -7325,7 +7358,7 @@ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + setBufferOffsets(Op.getOperand(4), 0, DAG, &Ops[3]); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; @@ -7489,7 +7522,7 @@ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + setBufferOffsets(Op.getOperand(5), 0, DAG, &Ops[4]); EVT VT = Op.getValueType(); @@ -7622,7 +7655,7 @@ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + setBufferOffsets(Op.getOperand(6), 0, DAG, &Ops[5]); EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); @@ -8102,7 +8135,7 @@ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + setBufferOffsets(Op.getOperand(5), 0, DAG, &Ops[4]); unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; @@ -8458,14 +8491,15 @@ // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, + uint32_t InstImmOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment) const { SDLoc DL(CombinedOffset); if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { - uint32_t Imm = C->getZExtValue(); + uint64_t Imm = C->getZExtValue() + InstImmOffset; uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, - Alignment)) { + if (Imm <= UINT_MAX && AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, + Subtarget, Alignment)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -8476,15 +8510,33 @@ SDValue N0 = CombinedOffset.getOperand(0); SDValue N1 = CombinedOffset.getOperand(1); uint32_t SOffset, ImmOffset; - int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); - if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - Subtarget, Alignment)) { + int64_t Imm = + cast<ConstantSDNode>(N1)->getSExtValue() + (int64_t)InstImmOffset; + if (Imm >= 0 && Imm <= UINT_MAX && + AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, + Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); return; } } + if (InstImmOffset) { + // The extra imm offset in buffer offsets is only supported for an intrinsic + // that is available on gfx9+. This also means that the later call to + // splitMUBUFOffset will always succeed; e.g. an InstImmOffset of 5000 is + // split into an soffset of 4092 plus an instruction offset of 908, as in + // the divergent-index tests. + assert(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9); + uint32_t SOffset, ImmOffset; + uint32_t Imm = InstImmOffset; + bool Res = + AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Alignment); + (void)Res; + assert(Res && "Unexpected overflow from InstImmOffset."); + Offsets[0] = CombinedOffset; + Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); + Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); + return; + } Offsets[0] = CombinedOffset; Offsets[1] = DAG.getConstant(0, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -41,7 +41,7 @@ def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", - SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>, [SDNPMayLoad, SDNPMemOperand] >; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3386,7 +3386,7 @@ // really needs a memory operand. def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); + let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$imm_offset, untyped_imm_0:$cachepolicy); let hasSideEffects = 0; let mayLoad = 1; let mayStore = 0; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -874,14 +874,14 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 1.
Offset as an immediate def : GCNPat < - (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy), + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), 0, timm:$cachepolicy), (vt (!cast(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> { let AddedComplexity = 2; } // 2. 32-bit IMM offset on CI def : GCNPat < - (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)), + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), 0, timm:$cachepolicy)), (!cast(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset, (extract_cpol $cachepolicy))> { let OtherPredicates = [isGFX7Only]; @@ -890,18 +890,27 @@ // 3. Offset loaded in an 32bit SGPR def : GCNPat < - (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy), + (SIsbuffer_load v4i32:$sbase, i32:$soffset, 0, timm:$cachepolicy), (vt (!cast(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy))) >; // 4. Offset as an 32-bit SGPR + immediate def : GCNPat < - (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset), + (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset), 0, timm:$cachepolicy), (vt (!cast(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset, (extract_cpol $cachepolicy)))> { let OtherPredicates = [isGFX9Plus]; } + + // 5. Offset as a 32-bit SGPR and separate immediate instruction offset. + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, i32:$soffset, i32:$offset, + timm:$cachepolicy), + (vt (!cast(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, smrd_literal_offset:$offset, + (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX9Plus]; + } } // Global and constant loads can be selected to either MUBUF or SMRD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir @@ -13,7 +13,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s32)) ; GCN-NEXT: S_ENDPGM 0, implicit [[AMDGPU_S_BUFFER_LOAD]](s32) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = G_CONSTANT i32 0 @@ -33,7 +33,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) ; GCN-NEXT: S_ENDPGM 0, implicit 
[[BUILD_VECTOR]](<3 x s32>) @@ -55,7 +55,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x p3>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) @@ -78,7 +78,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) ; GCN-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s16>) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) @@ -101,7 +101,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s192), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32) ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<6 x s32>) @@ -123,7 +123,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s192), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s64>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s192), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s64>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[UV2]](s64) ; GCN-NEXT: S_ENDPGM 0, implicit 
[[BUILD_VECTOR]](<3 x s64>) @@ -145,7 +145,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) @@ -211,7 +211,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GCN-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32) ; GCN-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -15,10 +15,10 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[COPY1]](s32), 0 + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[COPY1]](s32), 0, 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0, 0 ... @@ -40,7 +40,7 @@ ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C1]](s32), [[COPY1]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0, 0 ... @@ -97,7 +97,7 @@ ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr0 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0, 0 ... @@ -153,6 +153,6 @@ ; CHECK-NEXT: .4: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 - %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0 + %2:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %0, %1, 0, 0 ... 
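The regbankselect MIR above shows G_AMDGPU_S_BUFFER_LOAD carrying its two trailing immediates (the instruction offset, then the cachepolicy). For reference, a minimal MachineIRBuilder sketch of how such an instruction could be built by hand; B, DstReg, RsrcReg and SOffsetReg are illustrative names and not part of the patch:

  // Sketch only: construct the updated 4-source-operand generic form.
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)
                 .addDef(DstReg)      // loaded value
                 .addUse(RsrcReg)     // <4 x s32> resource descriptor
                 .addUse(SOffsetReg)  // s32 byte offset
                 .addImm(0)           // immediate instruction offset
                 .addImm(0);          // cachepolicy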
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32) @@ -29,7 +29,7 @@ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) ; GREEDY-NEXT: $sgpr0 = COPY [[INT]](s32) @@ -49,7 +49,7 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s64), align 4) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -68,7 +68,7 @@ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: 
(dereferenceable invariant load (s64), align 4) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -92,7 +92,7 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -114,7 +114,7 @@ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s96), align 4) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -141,7 +141,7 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s256), align 4) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -178,7 +178,7 @@ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x 
s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s256), align 4) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -220,7 +220,7 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s512), align 4) ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) @@ -281,7 +281,7 @@ ; GREEDY-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4) + ; GREEDY-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0, 0 :: (dereferenceable invariant load (s512), align 4) ; GREEDY-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>) ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32) ; GREEDY-NEXT: 
[[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir @@ -44,7 +44,7 @@ %1:_(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CONSTANT i32 256 %3:_(s32) = G_ADD %1, %2 - %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 + %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0, 0 S_ENDPGM 0, implicit %4 ... @@ -85,7 +85,7 @@ %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_CONSTANT i32 -60 %3:_(s32) = G_ADD %1, %2 - %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0 + %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0, %3, 0, 0 S_ENDPGM 0, implicit %4 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir @@ -786,7 +786,7 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s32>) = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[COPY]](<4 x s32>), [[C]](s32), 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s32>) = COPY [[COPY1]](<2 x s32>) ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 @@ -803,7 +803,7 @@ %1:_(<2 x s32>) = COPY $sgpr4_sgpr5 %2:_(s32) = COPY $vgpr0 %3:_(s32) = G_CONSTANT i32 0 - %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0(<4 x s32>), %3(s32), 0 :: (dereferenceable invariant load (s32)) + %4:_(s32) = G_AMDGPU_S_BUFFER_LOAD %0(<4 x s32>), %3(s32), 0, 0 :: (dereferenceable invariant load (s32)) %5:_(<2 x s32>) = G_INSERT_VECTOR_ELT %1, %4(s32), %2(s32) S_ENDPGM 0, implicit %5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.imm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.imm.ll @@ -0,0 +1,454 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GCN + +; The tests check code generated for @llvm.amdgcn.s.buffer.load.imm. +; The first group of tests is similar to tests @llvm.amdgcn.s.buffer.load.i32 with instruction immediate offset being 0. +; The second group of tests checks cases where instruction immediate offset is not equal to 0. 
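+; For context, a frontend would typically create these calls through IRBuilder.
+; A minimal C++ sketch, assuming this patch is applied (Rsrc, ByteOffset and
+; InsertPt are illustrative names):
+;   IRBuilder<> B(InsertPt);
+;   Value *Load = B.CreateIntrinsic(
+;       Intrinsic::amdgcn_s_buffer_load_imm, {B.getInt32Ty()},
+;       {Rsrc,            // <4 x i32> resource descriptor
+;        ByteOffset,      // i32 byte offset
+;        B.getInt32(16),  // i32 immediate instruction offset (constant)
+;        B.getInt32(0)}); // i32 cachepolicy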
+ +;GCN-LABEL: {{^}}s_buffer_load_imm: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 4, i32 0, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_index: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_index_divergent: +;GCN-NOT: s_waitcnt; +;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 %index) { +main_body: + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx2_imm: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40 +define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.imm.v2i32(<4 x i32> %desc, i32 64, i32 0, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx2_index: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.imm.v2i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx2_index_divergent: +;GCN-NOT: s_waitcnt; +;GCN: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i32 %index) { +main_body: + %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.imm.v2i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <2 x i32> %load to <2 x float> + %x = extractelement <2 x float> %bitcast, i32 0 + %y = extractelement <2 x float> %bitcast, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx3_imm: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40 +define 
amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.imm.v3i32(<4 x i32> %desc, i32 64, i32 0, i32 0) + %bitcast = bitcast <3 x i32> %load to <3 x float> + %x = extractelement <3 x float> %bitcast, i32 0 + %y = extractelement <3 x float> %bitcast, i32 1 + %z = extractelement <3 x float> %bitcast, i32 2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx3_index: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.imm.v3i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <3 x i32> %load to <3 x float> + %x = extractelement <3 x float> %bitcast, i32 0 + %y = extractelement <3 x float> %bitcast, i32 1 + %z = extractelement <3 x float> %bitcast, i32 2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx3_index_divergent: +;GCN-NOT: s_waitcnt; +;GCN: buffer_load_dwordx3 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i32 %index) { +main_body: + %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.imm.v3i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <3 x i32> %load to <3 x float> + %x = extractelement <3 x float> %bitcast, i32 0 + %y = extractelement <3 x float> %bitcast, i32 1 + %z = extractelement <3 x float> %bitcast, i32 2 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx4_imm: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8 +define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32> %desc, i32 200, i32 0, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx4_index: +;GCN-NOT: s_waitcnt; +;GCN: buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_loadx4_index_divergent: +;GCN-NOT: s_waitcnt; +;GCN: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i32 
%index) { +main_body: + %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32> %desc, i32 %index, i32 0, i32 0) + %bitcast = bitcast <4 x i32> %load to <4 x float> + %x = extractelement <4 x float> %bitcast, i32 0 + %y = extractelement <4 x float> %bitcast, i32 1 + %z = extractelement <4 x float> %bitcast, i32 2 + %w = extractelement <4 x float> %bitcast, i32 3 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_imm_mergex2: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4 +define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 4, i32 0, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 8, i32 0, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_imm_mergex4: +;GCN-NOT: s_waitcnt; +;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8 +define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { +main_body: + %load0 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 8, i32 0, i32 0) + %load1 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 12, i32 0, i32 0) + %load2 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 16, i32 0, i32 0) + %load3 = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 20, i32 0, i32 0) + %x = bitcast i32 %load0 to float + %y = bitcast i32 %load1 to float + %z = bitcast i32 %load2 to float + %w = bitcast i32 %load3 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_index_across_bb: +;GCN-NOT: s_waitcnt; +;GCN: v_or_b32 +;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) { +main_body: + %tmp = shl i32 %index, 4 + br label %bb1 + +bb1: ; preds = %main_body + %tmp1 = or i32 %tmp, 8 + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %tmp1, i32 0, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;GCN-LABEL: {{^}}s_buffer_load_index_across_bb_merged: +;GCN-NOT: s_waitcnt; +;GCN: v_or_b32 +;GCN: v_or_b32 +;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) { +main_body: + %tmp = shl i32 %index, 4 + br label %bb1 + +bb1: ; preds = %main_body + %tmp1 = or i32 %tmp, 8 + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %tmp1, i32 0, i32 0) + %tmp2 = or i32 %tmp1, 4 + %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %tmp2, i32 0, i32 0) + %bitcast = bitcast i32 %load to float + %bitcast2 = bitcast i32 %load2 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true) + ret void +} + +; 
GCN-LABEL: {{^}}s_buffer_load_imm_neg1: +; GCN: s_mov_b32 [[K:s[0-9]+]], -1{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_neg1(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -1, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_neg4: +; GCN: s_mov_b32 [[K:s[0-9]+]], -4{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_neg4(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -4, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_neg8: +; GCN: s_mov_b32 [[K:s[0-9]+]], -8{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_neg8(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -8, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit31: +; GCN: s_brev_b32 [[K:s[0-9]+]], 1{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit31(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -2147483648, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit30: +; GCN: s_mov_b32 [[K:s[0-9]+]], 2.0{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit30(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1073741824, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit29: +; GCN: s_brev_b32 [[K:s[0-9]+]], 4{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit29(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 536870912, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit21: +; GCN: s_mov_b32 [[K:s[0-9]+]], 0x200000{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit21(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 2097152, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit20: +; GCN: s_mov_b32 [[K:s[0-9]+]], 0x100000{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit20(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1048576, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_neg_bit20: +; GCN: s_mov_b32 [[K:s[0-9]+]], 0xfff00000{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_neg_bit20(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -1048576, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_bit19: +; GCN: s_buffer_load_dword s0, s[0:3], 0x80000{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_bit19(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 524288, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_neg_bit19: +; GCN: s_mov_b32 [[K:s[0-9]+]], 0xfff80000{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], [[K]]{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_neg_bit19(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 -524288, 
i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_255: +; GCN: s_buffer_load_dword s0, s[0:3], 0xff{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_255(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 255, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_256: +; GCN: s_buffer_load_dword s0, s[0:3], 0x100{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_256(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 256, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_1016: +; GCN: s_buffer_load_dword s0, s[0:3], 0x3f8{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_1016(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1016, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_1020: +; GCN: s_buffer_load_dword s0, s[0:3], 0x3fc{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_1020(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1020, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_1021: +; GCN: s_buffer_load_dword s0, s[0:3], 0x3fd{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_1021(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1021, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_1024: +; GCN: s_buffer_load_dword s0, s[0:3], 0x400{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_1024(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1024, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_imm_1025: +; GCN: s_buffer_load_dword s0, s[0:3], 0x401{{$}} +define amdgpu_ps i32 @s_buffer_load_imm_1025(<4 x i32> inreg %desc) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 1025, i32 0, i32 0) + ret i32 %load +} + +; GCN-LABEL: {{^}}s_buffer_load_index_imm_1024: +; GCN: s_buffer_load_dword s0, s[0:3], s4 offset:0x400{{$}} +define amdgpu_ps void @s_buffer_load_index_imm_1024(<4 x i32> inreg %desc, i32 inreg %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 1024, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_index_imm_0xffffc: +; GCN: s_buffer_load_dword s0, s[0:3], s4 offset:0xffffc{{$}} +define amdgpu_ps void @s_buffer_load_index_imm_0xffffc(<4 x i32> inreg %desc, i32 inreg %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 1048572, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_index_imm_overflow_0x100000: +; GCN: s_add_i32 s4, s4, 0x100000{{$}} +; GCN: s_buffer_load_dword s0, s[0:3], s4{{$}} +define amdgpu_ps void @s_buffer_load_index_imm_overflow_0x100000(<4 x i32> inreg %desc, i32 inreg %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 1048576, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: 
{{^}}s_buffer_load_index_imm_overflow_0x100004: +; GCN: s_buffer_load_dword s0, s[0:3], s4 offset:0x4{{$}} +define amdgpu_ps void @s_buffer_load_index_imm_overflow_0x100004(<4 x i32> inreg %desc, i32 inreg %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 1048580, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_divergent_index_imm_1024: +; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:1024{{$}} +define amdgpu_ps void @s_buffer_load_divergent_index_imm_1024(<4 x i32> inreg %desc, i32 %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 1024, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_divergent_index_imm_5000: +; GCN: s_movk_i32 s4, 0xffc +; GCN: buffer_load_dword v0, v0, s[0:3], s4 offen offset:908{{$}} +define amdgpu_ps void @s_buffer_load_divergent_index_imm_5000(<4 x i32> inreg %desc, i32 %index) { + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %index, i32 5000, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_divergent_index_add_imm_1024: +; GCN: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1028{{$}} +define amdgpu_ps void @s_buffer_load_divergent_index_add_imm_1024(<4 x i32> inreg %desc, i32 %index) { + %add = add i32 %index, 4 + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %add, i32 1024, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_divergent_index_add_imm_5000: +; GCN: s_movk_i32 s4, 0xffc +; GCN: buffer_load_dword v0, v0, s[0:3], s4 offen offset:912{{$}} +define amdgpu_ps void @s_buffer_load_divergent_index_add_imm_5000(<4 x i32> inreg %desc, i32 %index) { + %add = add i32 %index, 4 + %load = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> %desc, i32 %add, i32 5000, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) +declare i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32>, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.s.buffer.load.imm.v2i32(<4 x i32>, i32, i32, i32) +declare <3 x i32> @llvm.amdgcn.s.buffer.load.imm.v3i32(<4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32>, i32, i32, i32) + diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1212,6 +1212,400 @@ declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1 declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 +; 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1212,6 +1212,400 @@
 declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1
 declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.s.buffer.load.imm
+; --------------------------------------------------------------------
+
+define amdgpu_ps float @s_buffer_load_imm_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_imm_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  ret float %data
+}
+
+define amdgpu_ps <2 x float> @s_buffer_load_imm_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_imm_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  ret <2 x float> %data
+}
+
+define amdgpu_ps <4 x float> @s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  ret <4 x float> %data
+}
+
+define amdgpu_ps float @extract_elt0_s_buffer_load_imm_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_imm_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+define amdgpu_ps float @extract_elt1_s_buffer_load_imm_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+define amdgpu_ps float @extract_elt0_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+define amdgpu_ps float @extract_elt1_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+define amdgpu_ps float @extract_elt2_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 2
+  ret float %elt1
+}
+
+define amdgpu_ps float @extract_elt3_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 3
+  ret float %elt1
+}
+
+define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+define amdgpu_ps float @extract_elt0_s_buffer_load_imm_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_imm_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+define amdgpu_ps float @extract_elt1_s_buffer_load_imm_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+define amdgpu_ps float @extract_elt2_s_buffer_load_imm_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_imm_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 2
+  ret float %elt1
+}
+
+define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_imm_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_imm_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_imm_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_imm_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
+  %data = call <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; Do not trim to vec3 s_buffer_load_imm in instcombine, as the load will most likely be widened
+; to vec4 anyway during lowering.
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
+  %data = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+define i32 @extract0_bitcast_s_buffer_load_imm_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_imm_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
+}
+
+define float @extract0_bitcast_s_buffer_load_imm_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_imm_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.imm.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
+}
+
+define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_imm_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_imm_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
+  %data = call <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.s.buffer.load.imm.f32(<4 x i32>, i32, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.s.buffer.load.imm.v2f32(<4 x i32>, i32, i32, i32) #1
+declare <3 x float> @llvm.amdgcn.s.buffer.load.imm.v3f32(<4 x i32>, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.s.buffer.load.imm.v4f32(<4 x i32>, i32, i32, i32) #1
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.imm.v4i32(<4 x i32>, i32, i32, i32) #1
+
+define amdgpu_ps half @extract_elt0_s_buffer_load_imm_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_imm_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
+  %data = call <2 x half> @llvm.amdgcn.s.buffer.load.imm.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt0 = extractelement <2 x half> %data, i32 0
+  ret half %elt0
+}
+
+define amdgpu_ps half @extract_elt1_s_buffer_load_imm_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
+  %data = call <2 x half> @llvm.amdgcn.s.buffer.load.imm.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <2 x half> %data, i32 1
+  ret half %elt1
+}
+
+define amdgpu_ps half @extract_elt1_s_buffer_load_imm_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
+  %data = call <3 x half> @llvm.amdgcn.s.buffer.load.imm.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <3 x half> %data, i32 1
+  ret half %elt1
+}
+
+define amdgpu_ps half @extract_elt1_s_buffer_load_imm_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
+  %data = call <4 x half> @llvm.amdgcn.s.buffer.load.imm.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 1
+  ret half %elt1
+}
+
+
+define amdgpu_ps half @extract_elt3_s_buffer_load_imm_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_imm_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
+  %data = call <4 x half> @llvm.amdgcn.s.buffer.load.imm.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 3
+  ret half %elt1
+}
+
+define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_imm_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_imm_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.imm.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
+  %data = call <4 x half> @llvm.amdgcn.s.buffer.load.imm.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x half> %shuf
+}
+
+declare half @llvm.amdgcn.s.buffer.load.imm.f16(<4 x i32>, i32, i32, i32) #1
+declare <2 x half> @llvm.amdgcn.s.buffer.load.imm.v2f16(<4 x i32>, i32, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.s.buffer.load.imm.v3f16(<4 x i32>, i32, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.s.buffer.load.imm.v4f16(<4 x i32>, i32, i32, i32) #1
+
+define amdgpu_ps i8 @extract_elt0_s_buffer_load_imm_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_imm_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
+  %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.imm.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt0 = extractelement <2 x i8> %data, i32 0
+  ret i8 %elt0
+}
+
+define amdgpu_ps i8 @extract_elt1_s_buffer_load_imm_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
+  %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.imm.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <2 x i8> %data, i32 1
+  ret i8 %elt1
+}
+
+define amdgpu_ps i8 @extract_elt1_s_buffer_load_imm_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
+  %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.imm.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <3 x i8> %data, i32 1
+  ret i8 %elt1
+}
+
+define amdgpu_ps i8 @extract_elt1_s_buffer_load_imm_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_imm_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
+  %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.imm.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x i8> %data, i32 1
+  ret i8 %elt1
+}
+
+define amdgpu_ps i8 @extract_elt3_s_buffer_load_imm_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_imm_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 4, i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
+  %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.imm.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %elt1 = extractelement <4 x i8> %data, i32 3
+  ret i8 %elt1
+}
+
+define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_imm_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_imm_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.imm.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 4, i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
+  %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.imm.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 4, i32 0)
+  %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  ret <2 x i8> %shuf
+}
+
+declare i8 @llvm.amdgcn.s.buffer.load.imm.i8(<4 x i32>, i32, i32, i32) #1
+declare <2 x i8> @llvm.amdgcn.s.buffer.load.imm.v2i8(<4 x i32>, i32, i32, i32) #1
+declare <3 x i8> @llvm.amdgcn.s.buffer.load.imm.v3i8(<4 x i32>, i32, i32, i32) #1
+declare <4 x i8> @llvm.amdgcn.s.buffer.load.imm.v4i8(<4 x i32>, i32, i32, i32) #1
+
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.raw.buffer.load.format
 ; --------------------------------------------------------------------