diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -861,6 +861,13 @@ [IntrReadMem], [SDNPMemOperand]>; } + foreach dim = AMDGPUDims.Msaa in { + def int_amdgcn_image_msaa_load # _ # dim.Name: + AMDGPUImageDimIntrinsic< + AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>, + [IntrReadMem], [SDNPMemOperand]>; + } + ////////////////////////////////////////////////////////////////////////// // sample and getlod intrinsics ////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1537,6 +1537,7 @@ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; @@ -1653,7 +1654,12 @@ ++NumVDataDwords; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4963,6 +4963,10 @@ // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. 
+ // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && CorrectedNumVAddrs <= ST.getNSAMaxSize(); @@ -5343,6 +5347,8 @@ MachineRegisterInfo &MRI = *B.getMRI(); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); + const LLT V2S16 = LLT::fixed_vector(2, 16); + const LLT V3S32 = LLT::fixed_vector(3, 32); Register DstReg = MI.getOperand(0).getReg(); Register NodePtr = MI.getOperand(2).getReg(); @@ -5360,61 +5366,98 @@ return false; } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = - ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; + const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; int Opcode; if (UseNSA) { - Opcode = - AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, - NumVDataDwords, NumVAddrDwords); - } else { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); + } else { + Opcode = AMDGPU::getMIMGOpcode( + BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); SmallVector Ops; - if (Is64) { - auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - } else { + if (UseNSA && IsGFX11Plus) { + auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + auto Merged = B.buildMerge( + V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); + Ops.push_back(Merged.getReg(0)); + }; + Ops.push_back(NodePtr); - } - Ops.push_back(RayExtent); + Ops.push_back(RayExtent); + packLanes(RayOrigin); + + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + auto MergedDir = B.buildMerge( + V3S32, + {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0), + UnmergeRayDir.getReg(0)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1), + UnmergeRayDir.getReg(1)})) + .getReg(0), + B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2), + UnmergeRayDir.getReg(2)})) + .getReg(0)}); + Ops.push_back(MergedDir.getReg(0)); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } + } else { + if (Is64) { + auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + } else { + Ops.push_back(NodePtr); + } + Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B](Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); - Ops.push_back(Unmerge.getReg(0)); - Ops.push_back(Unmerge.getReg(1)); - Ops.push_back(Unmerge.getReg(2)); - }; + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + Ops.push_back(Unmerge.getReg(2)); + }; - 
packLanes(RayOrigin); - if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); - Register R1 = MRI.createGenericVirtualRegister(S32); - Register R2 = MRI.createGenericVirtualRegister(S32); - Register R3 = MRI.createGenericVirtualRegister(S32); - B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); - B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); - B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); - Ops.push_back(R1); - Ops.push_back(R2); - Ops.push_back(R3); - } else { - packLanes(RayDir); - packLanes(RayInvDir); + packLanes(RayOrigin); + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); + Register R1 = MRI.createGenericVirtualRegister(S32); + Register R2 = MRI.createGenericVirtualRegister(S32); + Register R3 = MRI.createGenericVirtualRegister(S32); + B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); + B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); + B.buildMerge(R3, + {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); + Ops.push_back(R1); + Ops.push_back(R2); + Ops.push_back(R3); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } } if (!UseNSA) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4491,8 +4491,10 @@ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } else { // NSA form - for (unsigned I = 2; I < N; ++I) - OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + for (unsigned I = 2; I < N; ++I) { + unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits(); + OpdsMapping[I] = 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } } break; } diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" @@ -160,15 +161,23 @@ GCNNSAReassign::NSA_Status GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return NSA_Status::NOT_NSA; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + case AMDGPU::MIMGEncGfx11NSA: + break; + default: + return NSA_Status::NOT_NSA; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned VgprBase = 0; bool NSA = false; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); if (Reg.isPhysical() || !VRM->isAssignedReg(Reg)) @@ -180,6 +189,7 @@ if (!PhysReg) return NSA_Status::FIXED; + // TODO: address the below limitation to handle GFX11 BVH instructions // Bail if address is not a VGPR32. That should be possible to extend the // optimization to work with subregs of a wider register tuples, but the // logic to find free registers will be much more complicated with much @@ -188,7 +198,7 @@ // parts of an address and it is either already consecutive or cannot // be reassigned if not. If needed it is better to rely on register // coalescer to process such address tuples. 
- if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg()) return NSA_Status::FIXED; // InlineSpiller does not call LRM::assign() after an LI split leaving @@ -279,7 +289,7 @@ SmallVector Intervals; SmallVector OrigRegs; SlotIndex MinInd, MaxInd; - for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + for (unsigned I = 0; I < Info->VAddrOperands; ++I) { const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); Register Reg = Op.getReg(); LiveInterval *LI = &LIS->getInterval(Reg); @@ -332,11 +342,11 @@ } if (!Success) { - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) if (VRM->hasPhys(Intervals[I]->reg())) LRM->unassign(*Intervals[I]); - for (unsigned I = 0; I < Info->VAddrDwords; ++I) + for (unsigned I = 0; I < Info->VAddrOperands; ++I) LRM->assign(*Intervals[I], OrigRegs[I]); continue; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6212,6 +6212,7 @@ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); + bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); SmallVector ResultTypes(Op->values()); SmallVector OrigResultTypes(Op->values()); @@ -6390,6 +6391,10 @@ // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. + // + // TODO: we can actually allow partial NSA where the final register is a + // contiguous set of the remaining addresses. + // This could help where there are more addresses than supported. bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3 && VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); @@ -6496,7 +6501,12 @@ UseNSA ? 
VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (IsGFX10Plus) { + if (IsGFX11Plus) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx11Default, + NumVDataDwords, NumVAddrDwords); + } else if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, @@ -7555,12 +7565,14 @@ return SDValue(); } + const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; const bool Is64 = NodePtr.getValueType() == MVT::i64; const unsigned NumVDataDwords = 4; const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = Subtarget->hasNSAEncoding() && - NumVAddrDwords <= Subtarget->getNSAMaxSize(); + const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; + const bool UseNSA = + Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); const unsigned BaseOpcodes[2][2] = { {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, @@ -7568,12 +7580,15 @@ int Opcode; if (UseNSA) { Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], - AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, - NumVAddrDwords); + IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA + : AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); } else { - Opcode = AMDGPU::getMIMGOpcode( - BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords, - PowerOf2Ceil(NumVAddrDwords)); + Opcode = + AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default + : AMDGPU::MIMGEncGfx10Default, + NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); } assert(Opcode != -1); @@ -7606,15 +7621,36 @@ } }; - if (Is64) - DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2); - else + if (UseNSA && IsGFX11Plus) { Ops.push_back(NodePtr); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + Ops.push_back(RayOrigin); + if (IsA16) { + SmallVector DirLanes, InvDirLanes, MergedLanes; + DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); + DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); + for (unsigned I = 0; I < 3; ++I) { + MergedLanes.push_back(DAG.getBitcast( + MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, + {DirLanes[I], InvDirLanes[I]}))); + } + Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); + } else { + Ops.push_back(RayDir); + Ops.push_back(RayInvDir); + } + } else { + if (Is64) + DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, + 2); + else + Ops.push_back(NodePtr); - Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); - packLanes(RayOrigin, true); - packLanes(RayDir, true); - packLanes(RayInvDir, false); + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + packLanes(RayOrigin, true); + packLanes(RayDir, true); + packLanes(RayInvDir, false); + } if (!UseNSA) { // Build a single vector containing all the operands so far prepared. diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -238,9 +238,21 @@ // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info) return; + uint8_t NewEncoding; + switch (Info->MIMGEncoding) { + case AMDGPU::MIMGEncGfx10NSA: + NewEncoding = AMDGPU::MIMGEncGfx10Default; + break; + case AMDGPU::MIMGEncGfx11NSA: + NewEncoding = AMDGPU::MIMGEncGfx11Default; + break; + default: + return; + } + int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); unsigned NewAddrDwords = Info->VAddrDwords; @@ -266,16 +278,23 @@ } unsigned VgprBase = 0; + unsigned NextVgpr = 0; bool IsUndef = true; bool IsKill = NewAddrDwords == Info->VAddrDwords; - for (unsigned i = 0; i < Info->VAddrDwords; ++i) { - const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); + for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx); unsigned Vgpr = TRI->getHWRegIndex(Op.getReg()); + unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32; + assert(Dwords > 0 && "Un-implemented for less than 32 bit regs"); - if (i == 0) { + if (Idx == 0) { VgprBase = Vgpr; - } else if (VgprBase + i != Vgpr) + NextVgpr = Vgpr + Dwords; + } else if (Vgpr == NextVgpr) { + NextVgpr = Vgpr + Dwords; + } else { return; + } if (!Op.isUndef()) IsUndef = false; @@ -308,21 +327,20 @@ } } - unsigned NewOpcode = - AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, - Info->VDataDwords, NewAddrDwords); + unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding, + Info->VDataDwords, NewAddrDwords); MI.setDesc(TII->get(NewOpcode)); MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); MI.getOperand(VAddr0Idx).setIsKill(IsKill); - for (unsigned i = 1; i < Info->VAddrDwords; ++i) + for (int i = 1; i < Info->VAddrOperands; ++i) MI.removeOperand(VAddr0Idx + 1); if (ToUntie >= 0) { 
MI.tieOperands( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), - ToUntie - (Info->VAddrDwords - 1)); + ToUntie - (Info->VAddrOperands - 1)); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: name: sample_1d diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) { ; GFX9-LABEL: atomic_swap_i32_1d: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX900 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -o - %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps float @atomic_swap_i32_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { ; GFX6-LABEL: atomic_swap_i32_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i16 %mip) { ; GFX9-LABEL: getresinfo_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { ; GFX6-LABEL: getresinfo_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-PACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps half @load_1d_f16_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_f16_x: @@ -47,19 +48,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_f16_x: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 
s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_f16_x: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v } @@ -107,19 +108,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_f16_y: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_f16_y: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call half 
@llvm.amdgcn.image.load.1d.half.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v } @@ -167,19 +168,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_f16_z: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_f16_z: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v } @@ -227,19 +228,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_f16_w: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_f16_w: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; 
GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v } @@ -290,19 +291,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v2f16_xy: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v2f16_xy: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v } @@ -353,19 +354,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v2f16_xz: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; 
GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v2f16_xz: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v } @@ -416,19 +417,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v2f16_xw: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v2f16_xw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return 
to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v } @@ -479,19 +480,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v2f16_yz: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v2f16_yz: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v } @@ -572,6 +573,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_v3f16_xyz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v } @@ -625,19 +645,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v4f16_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v4f16_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %v } @@ -694,22 +714,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_f16_tfe_dmask_x: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: 
s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_f16_tfe_dmask_x: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v2 +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { half, i32 } %v, 1 %vv = bitcast i32 %v.err to float @@ -769,22 +789,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v2f16_tfe_dmask_xy: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v2f16_tfe_dmask_xy: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: 
s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v2 +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <2 x half>, i32 } %v, 1 %vv = bitcast i32 %v.err to float @@ -847,23 +867,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v3f16_tfe_dmask_xyz: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v3f16_tfe_dmask_xyz: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v1 +; GFX10PLUS-NEXT: image_load v[1:3], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10PLUS-NEXT: s_waitcnt 
vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v3 +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <3 x half>, i32 } %v, 1 %vv = bitcast i32 %v.err to float @@ -922,22 +942,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v4f16_tfe_dmask_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v4f16_tfe_dmask_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v2 +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue { <4 x half>, i32 } %v, 1 %vv = bitcast i32 %v.err to float diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GFX68 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefix=NOPRT %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps float @load_1d_f32_x(<8 x i32> inreg %rsrc, i32 %s) { ; GFX68-LABEL: load_1d_f32_x: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { ; GFX6-LABEL: load_2d_v4f32_xyzw: @@ -17,19 +18,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part 
epilog ; -; GFX10-LABEL: load_2d_v4f32_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2d_v4f32_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } @@ -89,6 +90,34 @@ ; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2d_v4f32_xyzw_tfe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: 
v_mov_b32_e32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -151,6 +180,34 @@ ; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2d_v4f32_xyzw_tfe_lwe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: @@ -22,23 +23,23 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } @@ -104,6 +105,36 @@ ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 +; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v2, v7 +; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -172,6 +203,36 @@ ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 +; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v2, v7 +; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX6-LABEL: load_2darraymsaa_v4f32_xyzw: @@ -17,19 +18,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; 
GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } @@ -93,6 +94,36 @@ ; GFX10-NEXT: global_store_dword v9, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v8, v3 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v9 +; GFX11-NEXT: v_mov_b32_e32 v1, v10 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v3, v12 +; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -159,6 +190,36 @@ ; GFX10-NEXT: 
global_store_dword v9, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v8, v3 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v9 +; GFX11-NEXT: v_mov_b32_e32 v1, v10 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v3, v12 +; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw: @@ -22,23 +23,23 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_3d_v4f32_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_3d_v4f32_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_lshl_b32 s8, s0, 16 +; GFX10PLUS-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; 
GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } @@ -104,6 +105,36 @@ ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_lshl_b32 s8, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 +; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v2, v7 +; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -172,6 +203,36 @@ ; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_lshl_b32 s8, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 +; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v2, v7 +; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; 
RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { ; GFX6-LABEL: load_3d_v4f32_xyzw: @@ -17,19 +18,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_3d_v4f32_xyzw: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_3d_v4f32_xyzw: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v } @@ -91,6 +92,35 @@ ; GFX10-NEXT: global_store_dword v8, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: 
v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 @@ -155,6 +185,35 @@ ; GFX10-NEXT: global_store_dword v8, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe 
lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 %v.err = extractvalue { <4 x float>, i32 } %v, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 
+; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 
0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 
v2, 0xffff, v2, s12 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 
x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: @@ -140,125 +141,6 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_c_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, 
half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_c_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, 
half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body @@ -309,15 +191,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 - declare float 
@llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps void @image_store_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %data) { ; GFX6-LABEL: image_store_f32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) @@ -47,19 +48,33 @@ } define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray_a16: -; GCN: ; %bb.0: -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 -; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_bvh_intersect_ray_a16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX10-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 +; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh_intersect_ray_a16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -98,19 +113,33 @@ } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray_a16: -; GCN: ; %bb.0: -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GCN-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_bvh64_intersect_ray_a16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; 
return to shader part epilog +; +; GFX11-LABEL: image_bvh64_intersect_ray_a16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6 +; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -192,6 +221,38 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v18, v0 +; GFX11-NEXT: v_mov_b32_e32 v19, v1 +; GFX11-NEXT: v_mov_b32_e32 v15, v2 +; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_mov_b32_e32 v17, v4 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-NEXT: v_readfirstlane_b32 s5, v12 +; GFX11-NEXT: v_readfirstlane_b32 s6, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v14 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7] +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17 +; 
GFX11-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -276,6 +337,45 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v15 ; GFX1013-NEXT: v_mov_b32_e32 v3, v16 ; GFX1013-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX11-NEXT: v_mov_b32_e32 v14, v1 +; GFX11-NEXT: v_mov_b32_e32 v15, v2 +; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_mov_b32_e32 v17, v4 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v7, v2 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v3 +; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v10 +; GFX11-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; 
GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11_vgpr12 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -359,6 +459,39 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v19, v0 +; GFX11-NEXT: v_mov_b32_e32 v20, v1 +; GFX11-NEXT: v_mov_b32_e32 v21, v2 +; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_mov_b32_e32 v17, v4 +; GFX11-NEXT: v_mov_b32_e32 v18, v5 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v12 +; GFX11-NEXT: v_readfirstlane_b32 s5, v13 +; GFX11-NEXT: v_readfirstlane_b32 s6, v14 +; GFX11-NEXT: v_readfirstlane_b32 s7, v15 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7] +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr19_vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr9_vgpr10_vgpr11 +; 
GFX11-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -449,6 +582,46 @@ ; GFX1013-NEXT: v_mov_b32_e32 v2, v22 ; GFX1013-NEXT: v_mov_b32_e32 v3, v23 ; GFX1013-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v14, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_mov_b32_e32 v15, v1 +; GFX11-NEXT: v_mov_b32_e32 v16, v2 +; GFX11-NEXT: v_mov_b32_e32 v17, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX11-NEXT: v_mov_b32_e32 v18, v4 +; GFX11-NEXT: v_mov_b32_e32 v19, v5 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v8, v2 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v9, v3 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_readfirstlane_b32 s4, v10 +; GFX11-NEXT: v_readfirstlane_b32 s5, v11 +; GFX11-NEXT: v_readfirstlane_b32 s6, v12 +; GFX11-NEXT: v_readfirstlane_b32 s7, v13 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX11-NEXT: 
; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr17_vgpr18_vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> ret <4 x float> %r @@ -518,6 +691,47 @@ ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s9, 0x40400000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s10, 4.0 +; GFX11-NEXT: s_mov_b32 s14, 0x41000000 +; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v6, s12 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: flat_load_b32 v9, v[0:1] +; GFX11-NEXT: flat_load_b32 v10, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, s6 
+; GFX11-NEXT: v_mov_b32_e32 v3, s9 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s10 +; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid %node_ptr = load i32, i32* %gep_node_ptr, align 4 @@ -631,6 +845,41 @@ ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s9, 0x42004600 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s10, 0x44004700 +; GFX11-NEXT: s_mov_b32 s11, 0x45004800 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: flat_load_b32 v6, v[0:1] +; GFX11-NEXT: flat_load_b32 v7, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s9 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: v_mov_b32_e32 v4, s10 +; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], 
s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid %node_ptr = load i32, i32* %gep_node_ptr, align 4 @@ -708,6 +957,46 @@ ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s9, 0x40400000 +; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s10, 4.0 +; GFX11-NEXT: s_mov_b32 s14, 0x41000000 +; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v3, s9 +; GFX11-NEXT: v_mov_b32_e32 v6, s12 +; GFX11-NEXT: v_mov_b32_e32 v4, s10 +; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: v_mov_b32_e32 v7, s13 +; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7 +; GFX11-NEXT: s_movk_i32 s5, 0x102 +; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v9, s4 +; GFX11-NEXT: flat_load_b32 v11, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm %lid = tail call i32 
@llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 @@ -813,6 +1102,40 @@ ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b32 s9, 0x42004600 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: s_mov_b32 s10, 0x44004700 +; GFX11-NEXT: s_mov_b32 s11, 0x45004800 +; GFX11-NEXT: v_mov_b32_e32 v3, s9 +; GFX11-NEXT: v_mov_b32_e32 v4, s10 +; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6 +; GFX11-NEXT: s_movk_i32 s5, 0x102 +; GFX11-NEXT: v_mov_b32_e32 v7, s5 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: flat_load_b32 v8, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s8 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid %ray_extent = load float, float* %gep_ray, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -1,6 +1,7 
@@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { @@ -44,6 +45,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_f16_tfe_dmask0: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -112,6 +133,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_f16_tfe_dmask1: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; 
GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -180,6 +221,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_v2f16_tfe_dmask0: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -248,6 +309,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_v2f16_tfe_dmask1: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -316,6 +397,26 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_v2f16_tfe_dmask3: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -393,6 +494,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_v3f16_tfe_dmask7: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 @@ -469,6 +593,27 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; +; GFX11-LABEL: load_1d_v4f16_tfe_dmask15: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s11, s9 +; GFX11-NEXT: s_mov_b32 s10, s8 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s7, s5 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s5, s3 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[0:1], v[1:2], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm +; ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15: ; GFX8-UNPACKED: ; %bb.0: ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @load_1d(<8 x 
i32> inreg %rsrc, <2 x i16> %coords) { ; GFX9-LABEL: load_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { ; GFX9-LABEL: load_1d: @@ -14,6 +15,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -32,6 +39,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: 
[0x84,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %t = extractelement <2 x i16> %coords, i32 1 @@ -51,6 +64,12 @@ ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -71,6 +90,12 @@ ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_cube: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -91,6 +116,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1darray: +; GFX11: ; %bb.0: ; %main_body +; 
GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %slice = extractelement <2 x i16> %coords, i32 1 @@ -110,6 +141,12 @@ ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -130,6 +167,12 @@ ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x30,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2dmsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -150,6 +193,12 @@ ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x38,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -171,6 +220,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %mip = extractelement <2 x i16> %coords, i32 1 @@ -190,6 +245,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -210,6 +271,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] 
dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_3d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -231,6 +298,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_cube: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -252,6 +325,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_1darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = 
extractelement <2 x i16> %coords_lo, i32 0 %slice = extractelement <2 x i16> %coords_lo, i32 1 @@ -272,6 +351,12 @@ ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_2darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x05,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -291,6 +376,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -307,6 +397,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: 
[0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 %t = extractelement <2 x i16> %coords, i32 1 @@ -324,6 +419,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_3d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -342,6 +442,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_cube: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -360,6 +465,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 %slice = extractelement <2 x i16> %coords, i32 1 @@ 
-377,6 +487,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_2darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -395,6 +510,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x30,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_2dmsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -413,6 +533,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x38,0x1f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_2darraymsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -432,6 +557,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: 
image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 %mip = extractelement <2 x i16> %coords, i32 1 @@ -449,6 +579,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -467,6 +602,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_3d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -486,6 +626,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: 
[0x18,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_cube: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -505,6 +650,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_1darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %slice = extractelement <2 x i16> %coords_lo, i32 1 @@ -523,6 +673,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x24,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_mip_2darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 %t = extractelement <2 x i16> %coords_lo, i32 1 @@ -544,6 +699,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: 
[0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -562,6 +723,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -580,6 +747,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_3d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 
%v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -598,6 +771,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_cube: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -616,6 +795,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_1darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -634,6 +819,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: 
getresinfo_2darray: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -652,6 +843,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x30,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_2dmsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -670,6 +867,12 @@ ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x38,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_2darraymsaa: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x5d,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> 
@llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -688,6 +891,12 @@ ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x18,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_V1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x08,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call float @llvm.amdgcn.image.load.1d.f32.i16(i32 8, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -706,6 +915,12 @@ ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x19,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_V2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x09,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32 9, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -722,6 +937,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x12,0x20,0xf0,0x01,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d_V1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: 
[0x80,0x02,0x19,0xf0,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.f32.i16(float %vdata, i32 2, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -738,6 +958,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1c,0x20,0xf0,0x02,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d_V2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0c,0x19,0xf0,0x02,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float> %vdata, i32 12, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -756,6 +981,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x00,0x3f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_glc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -774,6 +1005,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x00,0x1f,0x00,0xf2,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: 
load_1d_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -792,6 +1029,12 @@ ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x00,0x3f,0x00,0xf2,0x00,0x00,0x00,0x40] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_glc_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x01,0xf0,0x00,0x00,0x00,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -808,6 +1051,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x00,0x3f,0x20,0xf0,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d_glc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -824,6 +1072,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store 
v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x00,0x1f,0x20,0xf2,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -840,6 +1093,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x00,0x3f,0x20,0xf2,0x04,0x00,0x00,0x40] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: store_1d_glc_slc: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: %s = extractelement <2 x i16> %coords, i32 0 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -854,6 +1112,10 @@ ; GFX10-LABEL: getresinfo_dmask0: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: getresinfo_dmask0: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 0, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll @@ -2,6 +2,7 
@@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s ; GCN-LABEL: {{^}}atomic_swap_1d: ; GFX6789: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX89 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}image_load_f16: ; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -3,7 +3,8 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FIJI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck 
-check-prefixes=NOPRT %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { ; VERDE-LABEL: load_1d: @@ -30,11 +31,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -104,22 +105,41 @@ ; ; GFX10-LABEL: load_1d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; 
encoding: [0x09,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: v_mov_b32_e32 v10, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: v_mov_b32_e32 v4, v10 +; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v6, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 +; GFX11-NEXT: v_mov_b32_e32 v3, v9 +; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue 
{<4 x float>, i32} %v, 0 @@ -192,22 +212,41 @@ ; ; GFX10-LABEL: load_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: v_mov_b32_e32 v10, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: v_mov_b32_e32 v4, v10 +; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v6, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 +; GFX11-NEXT: v_mov_b32_e32 v3, v9 +; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -241,11 +280,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -318,23 +357,43 @@ ; ; GFX10-LABEL: load_2d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, 
v7 ; encoding: [0x07,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2d_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 
+; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -368,11 +427,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -448,24 +507,45 @@ ; ; GFX10-LABEL: load_3d_tfe_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] 
-; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_3d_tfe_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: 
v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -499,11 +579,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -579,24 +659,45 @@ ; ; GFX10-LABEL: load_cube_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: 
[0x08,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_cube_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], 
v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -630,11 +731,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -707,23 +808,43 @@ ; ; GFX10-LABEL: load_1darray_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, 
v9 ; encoding: [0x09,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1darray_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -757,11 +878,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -837,24 +958,45 @@ ; ; GFX10-LABEL: load_2darray_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] 
-; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darray_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part 
epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -888,11 +1030,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2dmsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x30,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2dmsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -968,24 +1110,45 @@ ; ; GFX10-LABEL: load_2dmsaa_both: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 
v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 +; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2dmsaa_both: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v 
= call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1019,11 +1182,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_2darraymsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x38,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_2darraymsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -1102,25 +1265,47 @@ ; ; GFX10-LABEL: load_2darraymsaa_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] -; 
GFX10-NEXT: v_mov_b32_e32 v3, v12 ; encoding: [0x0c,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] -; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v9, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x09,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v9 +; GFX10-NEXT: v_mov_b32_e32 v11, v9 +; GFX10-NEXT: v_mov_b32_e32 v12, v9 +; GFX10-NEXT: v_mov_b32_e32 v13, v9 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 +; GFX10-NEXT: v_mov_b32_e32 v1, v10 +; GFX10-NEXT: v_mov_b32_e32 v2, v11 +; GFX10-NEXT: v_mov_b32_e32 v3, v12 +; GFX10-NEXT: v_mov_b32_e32 v4, v13 +; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v9, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_2darraymsaa_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_mov_b32_e32 v8, v3 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v0, v9 +; GFX11-NEXT: v_mov_b32_e32 v1, v10 +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mov_b32_e32 v3, v12 +; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe 
+; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1154,11 +1339,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -1231,23 +1416,43 @@ ; ; GFX10-LABEL: load_mip_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] -; 
GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_1d_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog 
main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1281,11 +1486,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -1361,24 +1566,45 @@ ; ; GFX10-LABEL: load_mip_2d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; encoding: [0x80,0x02,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; encoding: [0x08,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; encoding: [0x09,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; encoding: [0x0a,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; encoding: [0x0b,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v12 ; encoding: 
[0x0c,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v8, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x08,0x04,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, v8 +; GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-NEXT: v_mov_b32_e32 v12, v8 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_mov_b32_e32 v2, v10 +; GFX10-NEXT: v_mov_b32_e32 v3, v11 +; GFX10-NEXT: v_mov_b32_e32 v4, v12 +; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v8, v4, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_mip_2d_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v9 +; GFX11-NEXT: v_mov_b32_e32 v2, v10 +; GFX11-NEXT: v_mov_b32_e32 v3, v11 +; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} 
@llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1423,14 +1649,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_V2_tfe_dmask0: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; encoding: [0x01,0x03,0x04,0x7e] -; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x11,0x01,0xf0,0x00,0x01,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_V2_tfe_dmask0: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v2 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {<2 x float>, i32} %v, 1 @@ -1474,14 +1700,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_V1_tfe_dmask0: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; encoding: [0x01,0x03,0x04,0x7e] -; GFX10-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x11,0x01,0xf0,0x00,0x01,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: 
load_1d_V1_tfe_dmask0: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v1 +; GFX10PLUS-NEXT: image_load v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v2 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {float, i32} %v, 1 @@ -1525,14 +1751,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2d_tfe_dmask0: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2d_tfe_dmask0: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v3 +; GFX10PLUS-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v4 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {<4 x float>, i32} %v, 1 @@ -1576,14 +1802,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2d_tfe_nouse: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: 
v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2d_tfe_nouse: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v3 +; GFX10PLUS-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v4 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {<4 x float>, i32} %v, 1 @@ -1627,14 +1853,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2d_tfe_nouse_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x11,0x05,0xf0,0x00,0x03,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2d_tfe_nouse_V2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v3 +; GFX10PLUS-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v4 +; GFX10PLUS-NEXT: ; return to shader part epilog 
main_body: %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {<2 x float>, i32} %v, 1 @@ -1678,14 +1904,14 @@ ; NOPRT-NEXT: v_mov_b32_e32 v0, v3 ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2d_tfe_nouse_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x12,0x05,0xf0,0x00,0x03,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2d_tfe_nouse_V1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v3 +; GFX10PLUS-NEXT: image_load_mip v[3:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v4 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) %v.err = extractvalue {float, i32} %v, 1 @@ -1753,20 +1979,37 @@ ; ; GFX10-LABEL: load_1d_tfe_V4_dmask3: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; encoding: [0x05,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; encoding: [0x05,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; encoding: [0x05,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; encoding: [0x05,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; encoding: [0x06,0x03,0x02,0x7e] -; 
GFX10-NEXT: v_mov_b32_e32 v2, v7 ; encoding: [0x07,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v8 ; encoding: [0x08,0x03,0x06,0x7e] -; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v5, v3, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x03,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 +; GFX10-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v3, v8 +; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v5, v3, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_tfe_V4_dmask3: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mov_b32_e32 v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mov_b32_e32 v1, v6 +; GFX11-NEXT: v_mov_b32_e32 v2, v7 +; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v5, v3, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1831,18 +2074,33 @@ ; ; GFX10-LABEL: load_1d_tfe_V4_dmask2: ; GFX10: ; %bb.0: 
; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; encoding: [0x80,0x02,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; encoding: [0x04,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; encoding: [0x04,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; encoding: [0x05,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v6 ; encoding: [0x06,0x03,0x04,0x7e] -; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v4, v2, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x02,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v4, v2, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_tfe_V4_dmask2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v4, v2, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x 
float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1903,16 +2161,29 @@ ; ; GFX10-LABEL: load_1d_tfe_V4_dmask1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e] -; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v3, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_tfe_V4_dmask1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 
%s, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -1973,16 +2244,29 @@ ; ; GFX10-LABEL: load_1d_tfe_V2_dmask1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e] -; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v3, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: load_1d_tfe_V2_dmask1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue {<2 x 
float>, i32} %v, 0 @@ -2017,11 +2301,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2052,11 +2336,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2087,11 +2371,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: 
[0x20,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2122,11 +2406,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_mip_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_mip_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2153,10 +2437,10 @@ ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm 
main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2183,10 +2467,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2213,10 +2497,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2243,10 +2527,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: 
image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2273,10 +2557,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2303,10 +2587,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2333,10 +2617,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2dmsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; 
encoding: [0x30,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_2dmsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2363,10 +2647,10 @@ ; NOPRT-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_2darraymsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x38,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_2darraymsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2393,10 +2677,10 @@ ; NOPRT-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2423,10 +2707,10 @@ ; NOPRT-NEXT: 
image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2453,10 +2737,10 @@ ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2483,10 +2767,10 @@ ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; 
GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2513,10 +2797,10 @@ ; NOPRT-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2543,10 +2827,10 @@ ; NOPRT-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_mip_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x24,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_mip_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2577,11 +2861,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: 
[0x00,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2612,11 +2896,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2647,11 +2931,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; 
GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2682,11 +2966,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2717,11 +3001,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2752,11 +3036,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_2darray: 
-; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2787,11 +3071,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_2dmsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x30,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_2dmsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2822,11 +3106,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_2darraymsaa: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x38,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; 
GFX10PLUS-LABEL: getresinfo_2darraymsaa: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -2857,11 +3141,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x18,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_V1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -2892,11 +3176,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x19,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_V2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -2923,10 +3207,10 @@ ; 
NOPRT-NEXT: image_store v0, v1, s[0:7] dmask:0x2 unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x12,0x20,0xf0,0x01,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d_V1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2953,10 +3237,10 @@ ; NOPRT-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1c,0x20,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d_V2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -2987,11 +3271,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_glc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; encoding: [0x00,0x3f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_glc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call 
<4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) ret <4 x float> %v @@ -3022,11 +3306,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; encoding: [0x00,0x1f,0x00,0xf2,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) ret <4 x float> %v @@ -3057,11 +3341,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_glc_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; encoding: [0x00,0x3f,0x00,0xf2,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_glc_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) ret <4 x float> %v @@ -3088,10 +3372,10 @@ ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_glc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; encoding: 
[0x00,0x3f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d_glc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) ret void @@ -3118,10 +3402,10 @@ ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; encoding: [0x00,0x1f,0x20,0xf2,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) ret void @@ -3148,10 +3432,10 @@ ; NOPRT-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: store_1d_glc_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; encoding: [0x00,0x3f,0x20,0xf2,0x04,0x00,0x00,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: store_1d_glc_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) ret void @@ -3210,11 +3494,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_dmask7: -; GFX10: ; %bb.0: ; 
%main_body -; GFX10-NEXT: image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x17,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_dmask7: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <3 x float> @llvm.amdgcn.image.getresinfo.1d.v3f32.i32(i32 7, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %r @@ -3273,11 +3557,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_dmask3: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x13,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_dmask3: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_get_resinfo v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <2 x float> @llvm.amdgcn.image.getresinfo.1d.v2f32.i32(i32 3, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %r @@ -3336,11 +3620,11 @@ ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_dmask1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_get_resinfo v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x11,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_dmask1: +; GFX10PLUS: ; %bb.0: ; %main_body +; 
GFX10PLUS-NEXT: image_get_resinfo v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret float %r @@ -3363,9 +3647,9 @@ ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: getresinfo_dmask0: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: getresinfo_dmask0: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 0, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %r @@ -3406,13 +3690,13 @@ ; NOPRT-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf unorm ; NOPRT-NEXT: s_endpgm ; -; GFX10-LABEL: image_store_wait: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x00,0xf0,0x04,0x00,0x02,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x20,0xf0,0x04,0x00,0x04,0x00] -; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; GFX10PLUS-LABEL: image_store_wait: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> 
%arg, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg4, <8 x i32> %arg1, i32 0, i32 0) @@ -3467,13 +3751,23 @@ ; ; GFX10-LABEL: image_load_mmo: ; GFX10: ; %bb.0: -; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] -; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] +; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_load_mmo: +; GFX11: ; %bb.0: +; GFX11-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_store_2addr_b32 v0, v2, v2 offset1:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog store float 0.000000e+00, float addrspace(3)* %lds %c0 = extractelement <2 x i32> %c, i32 0 %c1 = extractelement <2 x i32> %c, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck --check-prefix=GCN %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16: ; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}gather4_2d: ; GFX6789: image_gather4 
v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefixes=GCN,PRE-GFX10 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}getlod_1d: ; PRE-GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}load.f16.1d: ; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}load.f32.1d: ; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -0,0 +1,133 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s + +; GCN-LABEL: {{^}}load_2dmsaa: +; GFX11: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; +define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa_both: +; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; +define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2darraymsaa: +; GFX11: image_msaa_load v[0:3], v[0:3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; +define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> 
%rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa_tfe: +; GFX11: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; +define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2dmsaa_glc: +; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc ; +define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa_slc: +; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm slc ; +define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa_glc_slc: +; GFX11: image_msaa_load v[0:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc ; +define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3) + ret <4 x float> %v +} + +; GCN-LABEL: 
{{^}}load_2dmsaa_d16: +; GFX11: image_msaa_load v[0:1], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; +define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa_tfe_d16: +; GFX11: image_msaa_load v[0:2], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; +define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x half>, i32} %v, 0 + %v.err = extractvalue {<4 x half>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x half> %v.vec +} + +; GCN-LABEL: {{^}}load_2darraymsaa_d16: +; GFX11: image_msaa_load v[0:1], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; +define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x half> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa_tfe_d16: +; GFX11: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; +define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x half>, i32} %v, 0 + %v.err = 
extractvalue {<4 x half>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x half> %v.vec +} + +; GCN-LABEL: {{^}}load_2dmsaa_a16: +; GFX11: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; +define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa_a16: +; GFX11: image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; +define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 + +declare <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 + +declare <4 x float> 
@llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010,NSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030,NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1010-NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1030-NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX11-NSA %s ; GCN-LABEL: {{^}}sample_2d: ; @@ -25,8 +27,9 @@ } ; GCN-LABEL: {{^}}sample_d_3d: -; GFX1010: image_sample_d v[0:3], v[7:22], -; GFX1030: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1], +; GFX1010-NSA: image_sample_d v[0:3], v[7:22], +; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1], +; GFX11-NSA: image_sample_d v[0:3], v[7:22], define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float 
%t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) { main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -34,9 +37,14 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa: -; NONSA: image_sample_c_l v5, v[0:4], -; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4], -; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5], +; GFX10-NONSA: image_sample_c_l v5, v[0:4], +; GFX11-NONSA: image_sample_c_l v0, v[0:4], +; GFX1010-NSA: image_sample_c_l v8, v[0:4], +; GFX1010-NSA: image_sample v9, [v6, v7, v5], +; GFX1030-NSA: image_sample_c_l v0, v[0:4], +; GFX1030-NSA: image_sample v1, [v6, v7, v5], +; GFX11-NSA: image_sample_c_l v0, v[0:4], +; GFX11-NSA: image_sample v1, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -47,8 +55,12 @@ } ; GCN-LABEL: {{^}}sample_nsa_nsa: -; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0], -; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5], +; GFX1010-NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; GFX1010-NSA: image_sample v9, [v6, v7, v5], +; GFX1030-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], +; GFX1030-NSA: image_sample v1, [v6, v7, v5], +; GFX11-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], +; GFX11-NSA: image_sample v1, [v6, v7, v5], define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) { main_body: %v1 = call float 
@llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -59,8 +71,12 @@ } ; GCN-LABEL: {{^}}sample_nsa_contig: -; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0], -; NSA: image_sample v{{[0-9]+}}, v[5:7], +; GFX1010-NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0], +; GFX1010-NSA: image_sample v9, v[5:7], +; GFX1030-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], +; GFX1030-NSA: image_sample v1, v[5:7], +; GFX11-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0], +; GFX11-NSA: image_sample v1, v[5:7], define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -71,10 +87,16 @@ } ; GCN-LABEL: {{^}}sample_contig_contig: -; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4], -; NSA: image_sample v{{[0-9]+}}, v[5:7], -; NONSA: image_sample_c_l v{{[0-9]+}}, v[0:4], -; NONSA: image_sample v{{[0-9]+}}, v[5:7], +; GFX1010-NSA: image_sample_c_l v8, v[0:4], +; GFX1010-NSA: image_sample v9, v[5:7], +; GFX1030-NSA: image_sample_c_l v0, v[0:4], +; GFX1030-NSA: image_sample v1, v[5:7], +; GFX11-NSA: image_sample_c_l v0, v[0:4], +; GFX11-NSA: image_sample v1, v[5:7], +; GFX10-NONSA: image_sample_c_l v8, v[0:4], +; GFX10-NONSA: image_sample v9, v[5:7], +; GFX11-NONSA: image_sample_c_l v0, v[0:4], +; GFX11-NONSA: image_sample v1, v[5:7], define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) { main_body: %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float 
%lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: sample_1d: @@ -764,202 +765,6 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { -; GFX9-LABEL: sample_cd_1d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; GFX9-LABEL: sample_cd_2d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { -; GFX9-LABEL: sample_c_cd_1d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float 
%zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { -; GFX9-LABEL: sample_c_cd_2d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 -; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { -; GFX9-LABEL: sample_cd_cl_1d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 
16, v2 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; GFX9-LABEL: sample_cd_cl_2d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 -; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { 
-; GFX9-LABEL: sample_c_cd_cl_1d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { -; GFX9-LABEL: sample_c_cd_cl_2d: -; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 -; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: 
image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { ; GFX9-LABEL: sample_l_1d: ; GFX9: ; %bb.0: ; %main_body @@ -1226,15 +1031,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> 
@llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 - declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32, float, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s) { +; GFX9-LABEL: sample_cd_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, 
half %t) { +; GFX9-LABEL: sample_cd_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s) { +; GFX9-LABEL: sample_c_cd_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, 
<4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { +; GFX9-LABEL: sample_c_cd_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { +; GFX9-LABEL: sample_cd_cl_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader 
part epilog +; +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; GFX9-LABEL: sample_cd_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + 
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { +; GFX9-LABEL: sample_c_cd_cl_1d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { +; GFX9-LABEL: sample_c_cd_cl_2d: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, 
v1 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32, float, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32, float, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32, float, half, half, half, half, half, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.dim.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { +; VERDE-LABEL: sample_cd_1d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_cd_1d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +; VERDE-LABEL: sample_cd_2d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: 
sample_cd_2d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { +; VERDE-LABEL: sample_c_cd_1d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_c_cd_1d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +; VERDE-LABEL: sample_c_cd_2d: +; VERDE: ; %bb.0: ; %main_body +; 
VERDE-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_c_cd_2d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { +; VERDE-LABEL: sample_cd_cl_1d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_cd_cl_1d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> 
@sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +; VERDE-LABEL: sample_cd_cl_2d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_cd_cl_2d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { +; VERDE-LABEL: sample_c_cd_cl_1d: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: ; return to shader part epilog +; +; GFX6789-LABEL: sample_c_cd_cl_1d: +; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf +; GFX6789-NEXT: s_waitcnt vmcnt(0) +; GFX6789-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v 
= call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 
%s + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; 
%bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> 
%v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; 
%bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x 
float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; TONGA-LABEL: image_sample_2d_f16: @@ -32,14 +33,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_2d_f16: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo -; 
GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_2d_f16: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) ret half %tex @@ -109,6 +110,22 @@ ; GFX10-NEXT: global_store_dword v4, v3, s[12:13] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_2d_f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s14, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: global_store_b32 v4, v3, s[12:13] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {half, i32} %tex, 0 @@ -138,11 +155,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_c_d_1d_v2f16: -; GFX10: ; %bb.0: ; %main_body 
-; GFX10-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) %r = bitcast <2 x half> %tex to float @@ -182,15 +199,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16_tfe: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<2 x half>, i32} %tex, 0 @@ -233,14 +250,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; 
-; GFX10-LABEL: image_sample_b_2d_v3f16: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_b_2d_v3f16: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> @@ -296,20 +313,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_b_2d_v3f16_tfe: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<3 x half>, i32} %tex, 0 @@ -357,14 +374,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_b_2d_v4f16: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_b_2d_v4f16: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) %r = bitcast <4 x half> %tex to <2 x float> @@ -421,20 +438,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: image_sample_b_2d_v4f16_tfe: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo 
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: image_sample_b_2d_v4f16_tfe: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<4 x half>, i32} %tex, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck 
-check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { ; VERDE-LABEL: sample_1d: @@ -22,14 +23,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -79,25 +80,47 @@ ; ; GFX10-LABEL: sample_1d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: 
[0x06,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: v_mov_b32_e32 v10, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: v_mov_b32_e32 v4, v10 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v6, v4, s[12:13] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s14, exec_lo +; 
GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 +; GFX11-NEXT: v_mov_b32_e32 v3, v9 +; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v6, v4, s[12:13] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -131,17 +154,17 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x01,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, 
exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -178,17 +201,17 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x02,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x 
float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -225,17 +248,17 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_3: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x04,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_3: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -272,17 +295,17 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_4: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 
s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x08,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_4: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -321,18 +344,18 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: 
[0x00,0x03,0x04,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x03,0x81,0xf0,0x03,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_12: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -373,18 +396,18 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0a,0x81,0xf0,0x03,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_24: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -427,19 +450,19 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_tfe_adjust_writemask_134: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0d,0x81,0xf0,0x04,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_134: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: 
s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -499,25 +522,47 @@ ; ; GFX10-LABEL: sample_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; 
GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_mov_b32 s14, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: v_mov_b32_e32 v9, v6 +; GFX10-NEXT: v_mov_b32_e32 v10, v6 +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: v_mov_b32_e32 v4, v10 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v6, v4, s[12:13] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_lwe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s14, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v9, v6 +; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v0, v6 +; GFX11-NEXT: v_mov_b32_e32 v1, v7 +; GFX11-NEXT: v_mov_b32_e32 v2, v8 +; GFX11-NEXT: v_mov_b32_e32 v3, v9 +; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v6, v4, s[12:13] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 
2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -545,14 +590,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -577,14 +622,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_3d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_3d: +; GFX10PLUS: ; %bb.0: ; %main_body +; 
GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -609,14 +654,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_cube: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x18,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_cube: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -641,14 +686,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: 
s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x20,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -673,14 +718,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_2darray: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_2darray: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; 
GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -705,14 +750,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -737,14 +782,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: 
[0x08,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -769,14 +814,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_cl_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -801,14 +846,14 @@ ; 
GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -833,14 +878,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_cl_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; 
GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -865,14 +910,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -897,14 +942,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_b_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: 
s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_b_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -929,14 +974,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_b_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_b_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return 
to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -961,14 +1006,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_b_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_b_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -993,14 +1038,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_b_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: 
[0x08,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_b_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1025,14 +1070,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_b_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_b_cl_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 
0) ret <4 x float> %v @@ -1057,14 +1102,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_b_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_b_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1089,14 +1134,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_b_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_b_cl_1d: +; 
GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1121,14 +1166,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_b_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_b_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1147,11 +1192,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: 
sample_d_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_d_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1170,11 +1215,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_d_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_d_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1193,11 +1238,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1216,11 +1261,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1239,11 +1284,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_d_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_d_cl_1d: +; GFX10PLUS: ; %bb.0: 
; %main_body +; GFX10PLUS-NEXT: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1262,11 +1307,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_d_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_d_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1285,11 +1330,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_cl_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt 
vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1308,200 +1353,16 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_cl_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) { -; VERDE-LABEL: sample_cd_1d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_cd_1d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: 
[0x00,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; VERDE-LABEL: sample_cd_2d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_cd_2d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -; VERDE-LABEL: sample_c_cd_1d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_c_cd_1d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: 
image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { -; VERDE-LABEL: sample_c_cd_2d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_c_cd_2d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x 
i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) { -; VERDE-LABEL: sample_cd_cl_1d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_cd_cl_1d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; VERDE-LABEL: sample_cd_cl_2d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_cd_cl_2d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] 
-; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { -; VERDE-LABEL: sample_c_cd_cl_1d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_c_cd_cl_1d: -; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { -; VERDE-LABEL: sample_c_cd_cl_2d: -; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: ; return to shader part epilog -; -; GFX6789-LABEL: sample_c_cd_cl_2d: -; GFX6789: ; %bb.0: ; 
%main_body -; GFX6789-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: sample_c_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { ; VERDE-LABEL: sample_l_1d: ; VERDE: ; %bb.0: ; %main_body @@ -1515,11 +1376,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_l_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x90,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_l_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1538,11 +1399,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_l_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: 
image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x90,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_l_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1561,11 +1422,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_l_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_l_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1584,11 +1445,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_l_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog 
+; GFX10PLUS-LABEL: sample_c_l_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1607,11 +1468,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_lz_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x9c,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_lz_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1630,11 +1491,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_lz_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x9c,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_lz_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: 
%v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1653,11 +1514,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_lz_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xbc,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_lz_1d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1676,11 +1537,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_lz_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xbc,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_lz_2d: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1699,11 +1560,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; 
GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_o_2darray_V1: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_o_2darray_V1: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret float %v @@ -1738,16 +1599,29 @@ ; ; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v9, v11 ; encoding: [0x0b,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v12 ; encoding: [0x0c,0x03,0x14,0x7e] -; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x04,0xe9,0xf0,0x00,0x09,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e] -; GFX10-NEXT: global_store_dword v11, v10, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x0a,0x0c,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, v11 +; GFX10-NEXT: v_mov_b32_e32 v9, v11 +; GFX10-NEXT: v_mov_b32_e32 v10, v12 +; 
GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v9 +; GFX10-NEXT: global_store_dword v11, v10, s[12:13] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_o_2darray_V1_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: v_mov_b32_e32 v12, v11 +; GFX11-NEXT: v_mov_b32_e32 v9, v11 +; GFX11-NEXT: v_mov_b32_e32 v10, v12 +; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v9 +; GFX11-NEXT: global_store_b32 v11, v10, s[12:13] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %v.vec = extractvalue {float, i32} %v, 0 @@ -1769,11 +1643,11 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_o_2darray_V2: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, 
float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <2 x float> %v @@ -1804,17 +1678,17 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] -; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x28,0x06,0xe9,0xf0,0x00,0x09,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2_tfe: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: v_mov_b32_e32 v9, 0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v10, v9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v11, v9 +; GFX10PLUS-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v9 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v10 +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v11 +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %v.vec = extractvalue {<2 x float>, i32} %v, 0 @@ -1847,14 +1721,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to 
shader part epilog ; -; GFX10-LABEL: sample_1d_unorm: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_unorm: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0) ret <4 x float> %v @@ -1879,14 +1753,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_glc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; encoding: [0x00,0x2f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_glc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 
+; GFX10PLUS-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1) ret <4 x float> %v @@ -1911,14 +1785,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; encoding: [0x00,0x0f,0x80,0xf2,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2) ret <4 x float> %v @@ -1943,14 +1817,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: sample_1d_glc_slc: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:3], v0, 
s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc ; encoding: [0x00,0x2f,0x80,0xf2,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: sample_1d_glc_slc: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3) ret <4 x float> %v @@ -1975,14 +1849,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_0: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_0: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %elt0 = 
extractelement <4 x float> %r, i32 0 @@ -2008,14 +1882,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_01: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x03,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_01: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2041,14 +1915,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_012: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x07,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: 
adjust_writemask_sample_012: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <3 x i32> @@ -2074,14 +1948,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_12: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_12: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2107,14 +1981,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: 
adjust_writemask_sample_03: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x09,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_03: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2140,14 +2014,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_13: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_13: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; 
GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2173,14 +2047,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_123: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_123: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <3 x i32> @@ -2196,9 +2070,9 @@ ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_none_enabled: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: 
adjust_writemask_sample_none_enabled: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %r @@ -2223,14 +2097,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_123_to_12: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_123_to_12: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 14, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2256,14 +2130,14 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: adjust_writemask_sample_013_to_13: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] -; 
GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: adjust_writemask_sample_013_to_13: +; GFX10PLUS: ; %bb.0: ; %main_body +; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo +; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10PLUS-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 11, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> @@ -2303,15 +2177,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> 
@llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 - declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: @@ -7,6 +8,12 @@ ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_d_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -22,6 +29,16 @@ ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_d_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -39,6 +56,18 @@ ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_d_3d: +; GFX11: ; %bb.0: ; 
%main_body +; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04] +; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -50,6 +79,12 @@ ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -65,6 +100,16 @@ ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] 
dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04] +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04] +; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -76,6 +121,12 @@ ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_d_cl_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> 
%samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -91,6 +142,16 @@ ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_d_cl_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -102,6 +163,12 @@ ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_cl_1d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: 
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -119,117 +186,23 @@ ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_cl_2d: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04] +; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04] +; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_c_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_c_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x 
i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v 
-} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body @@ -244,6 +217,20 @@ ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_o_2darray_V1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: 
v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] +; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret float %v @@ -263,6 +250,20 @@ ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_o_2darray_V2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: 
[0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] +; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <2 x float> %v @@ -278,15 +279,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 
-declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 - declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: @@ -124,112 +125,6 @@ ret <4 x float> %v } -define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps 
<4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { -; GFX10-LABEL: sample_c_cd_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { -; GFX10-LABEL: sample_c_cd_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; 
return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_1d: -; GFX10: ; %bb.0: ; %main_body -; 
GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - -define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { -; GFX10-LABEL: sample_c_cd_cl_2d: -; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 -; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -main_body: - %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) - ret <4 x float> %v -} - define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body @@ -278,15 +173,6 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 
-declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 - declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; 
RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps void @store_f16_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) { ; GFX9-LABEL: store_f16_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) { ; GFX9-LABEL: store_f32_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s -; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s +; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s ; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck 
-check-prefix=ERR %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) @@ -38,26 +39,49 @@ } define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray_a16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s15, s12 -; GCN-NEXT: s_mov_b32 s12, s9 -; GCN-NEXT: s_lshr_b32 s9, s7, 16 -; GCN-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GCN-NEXT: s_pack_ll_b32_b16 s7, s9, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: s_mov_b32 s14, s11 -; GCN-NEXT: s_mov_b32 s13, s10 -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_bvh_intersect_ray_a16: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s15, s12 +; GFX10-NEXT: s_mov_b32 s12, s9 +; GFX10-NEXT: s_lshr_b32 s9, s7, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s9, s8 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: 
s_mov_b32 s14, s11 +; GFX10-NEXT: s_mov_b32 s13, s10 +; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh_intersect_ray_a16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s2, s7, 16 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s7 +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s6, s8 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_mov_b32_e32 v5, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: s_mov_b32 s15, s12 +; GFX11-NEXT: s_mov_b32 s14, s11 +; GFX11-NEXT: s_mov_b32 s13, s10 +; GFX11-NEXT: s_mov_b32 s12, s9 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -89,27 +113,51 @@ } define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray_a16: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b32 s14, s12 -; GCN-NEXT: s_mov_b32 s12, s10 -; GCN-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GCN-NEXT: s_pack_ll_b32_b16 s8, s10, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; 
GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: s_mov_b32 s15, s13 -; GCN-NEXT: s_mov_b32 s13, s11 -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_bvh64_intersect_ray_a16: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s14, s12 +; GFX10-NEXT: s_mov_b32 s12, s10 +; GFX10-NEXT: s_lshr_b32 s10, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: s_mov_b32 s15, s13 +; GFX10-NEXT: s_mov_b32 s13, s11 +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_bvh64_intersect_ray_a16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_lshr_b32 s0, s8, 16 +; GFX11-NEXT: s_lshr_b32 s3, s6, 16 +; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: s_mov_b32 s15, s13 +; GFX11-NEXT: s_mov_b32 s14, s12 +; GFX11-NEXT: s_mov_b32 s13, s11 +; GFX11-NEXT: s_mov_b32 s12, s10 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16 
+; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -172,6 +220,33 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX11-NEXT: flat_load_b32 v9, v[0:1] +; GFX11-NEXT: flat_load_b32 v10, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid @@ -240,6 +315,30 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 
0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX11-NEXT: flat_load_b32 v6, v[0:1] +; GFX11-NEXT: flat_load_b32 v7, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid @@ -312,6 +411,32 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 +; GFX11-NEXT: v_mov_b32_e32 v10, 0x102 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: flat_load_b32 v11, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000 
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid @@ -376,6 +501,29 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm +; +; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x102 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 +; GFX11-NEXT: flat_load_b32 v8, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX11-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx11.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-image-load-gfx11.mir @@ -0,0 +1,490 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX11 %s + +# GFX11-LABEL: name: 
image_load_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_load_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- +# GFX11-LABEL: name: image_load_merged_v1v3_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 + +name: image_load_merged_v1v3_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_merged_v2v2 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 +# GFX11: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 + +name: image_load_merged_v2v2 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_merged_v2v2_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 +# GFX11: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 + +name: image_load_merged_v2v2_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_merged_v3v1 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 + +name: image_load_merged_v3v1 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_merged_v3v1_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 + +name: image_load_merged_v3v1_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_divided_merged +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) + +name: image_load_divided_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %9:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %7:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %11:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_divided_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_divided_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vreg_128 = COPY %2 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + IMAGE_STORE_V4_V2 %4:vreg_128, %5:vreg_64, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_dmask_overlapped_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_dmask_overlapped_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_dmask_not_disjoint_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 11, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_dmask_not_disjoint_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 11, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_0 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %6, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_0 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %6, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_1 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %6, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %6, %4, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_1 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %5:vgpr_32 = COPY %2.sub3 + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %6, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %6, %4, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_3 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_4 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_4 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_5 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_5 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_6 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_6 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_7 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_load_not_merged_7 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_load_not_merged_8 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V1_gfx11 %6, %3, 8, 1, -1, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 16, addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first load takes a vgpr_32 address (%6, a copy of %5.sub0) and has its 6th immediate operand set to 1; the second takes the vreg_64 %5 with that operand 0. The check lines expect both to remain unmerged. +name: image_load_not_merged_8 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = COPY %5.sub0 + %7:vgpr_32 = IMAGE_LOAD_V1_V1_gfx11 %6, %3, 8, 1, -1, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_not_merged_9 +# GFX11: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first load (dmask 8, vreg_64 result) has its 7th immediate operand set to 1; the second load (dmask 7) has it 0. Address and sgpr_256 operands match, and the check lines expect both to remain unmerged. +name: image_load_not_merged_9 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_not_merged_10 +# GFX11: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first load (dmask 8, vreg_64 result) has its 8th immediate operand set to 1; the second load (dmask 7) has it 0. Address and sgpr_256 operands match, and the check lines expect both to remain unmerged. +name: image_load_not_merged_10 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_not_merged_11 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first load (dmask 8) has its 9th, final immediate operand set to 1; the second load (dmask 7) has it 0. Address and sgpr_256 operands match, and the check lines expect both to remain unmerged. +name: image_load_not_merged_11 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx11 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx11 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_mip_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V3_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_LOAD_MIP instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 load with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_load_mip_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V3_gfx11 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_V3_V3_gfx11 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + + + +# GFX11-LABEL: name: image_load_mip_pck_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V3_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_LOAD_MIP_PCK instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 load with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_load_mip_pck_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V3_gfx11 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V3_gfx11 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + + + +# GFX11-LABEL: name: image_load_mip_pck_sgn_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V3_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_LOAD_MIP_PCK_SGN instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 load with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_load_mip_pck_sgn_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V3_gfx11 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V3_gfx11 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_pck_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_LOAD_PCK instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 load with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_load_pck_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_PCK_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_load_pck_sgn_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V2_gfx11 %5, %3, 15, 1, -1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_LOAD_PCK_SGN instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 load with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_load_pck_sgn_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V2_gfx11 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V2_gfx11 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir @@ -0,0 +1,1013 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX11 %s + +# GFX11-LABEL: name: image_sample_l_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 +# Two IMAGE_SAMPLE_L instructions with identical operands except dmask (1 and 14); the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub0 and sub1_sub2_sub3. +name: image_sample_l_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- +# GFX11-LABEL: name: image_sample_l_merged_v1v3_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 +# Two IMAGE_SAMPLE_L instructions with identical operands except dmask (8 first, then 7); the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub3 and sub0_sub1_sub2. +name: image_sample_l_merged_v1v3_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_merged_v2v2 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 +# GFX11: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 +# Two vreg_64 IMAGE_SAMPLE_L instructions with identical operands except dmask (3 first, then 12); the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub0_sub1 and sub2_sub3. +name: image_sample_l_merged_v2v2 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_merged_v2v2_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 +# GFX11: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 +# Two vreg_64 IMAGE_SAMPLE_L instructions with identical operands except dmask (12 first, then 3); the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub2_sub3 and sub0_sub1. +name: image_sample_l_merged_v2v2_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_merged_v3v1 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 +# A vreg_96 sample (dmask 7) followed by a vgpr_32 sample (dmask 8) with otherwise identical operands; the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub0_sub1_sub2 and sub3. +name: image_sample_l_merged_v3v1 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_merged_v3v1_reversed +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 +# A vreg_96 sample (dmask 14) followed by a vgpr_32 sample (dmask 1) with otherwise identical operands; the check lines expect a single V4 sample with dmask 15 whose result is split by COPYs of sub1_sub2_sub3 and sub0. +name: image_sample_l_merged_v3v1_reversed +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_divided_merged +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), addrspace 4) +# A dmask-8 sample on %5 and a dmask-7 sample on %5 are separated by buffer loads and by a dmask-7 sample that uses a different address (%7); the check line expects a single V4 sample on %5 with dmask 15. +name: image_sample_l_divided_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %9:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %7:vgpr_32, %7:vgpr_32, %7:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %11:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_divided_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# An IMAGE_STORE to the same sgpr_256 operand (%3) sits between the dmask-8 and dmask-7 samples, which otherwise have identical operands; the check lines expect both samples to remain unmerged. +name: image_sample_l_divided_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vreg_128 = COPY %2 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + IMAGE_STORE_V4_V2_nsa_gfx11 %4:vreg_128, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_dmask_overlapped_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples have dmask 4 and dmask 7, which share bit 2; all other operands match, and the check lines expect both to remain unmerged. +name: image_sample_l_dmask_overlapped_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_dmask_not_disjoint_not_merged +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 11, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples have dmask 4 and dmask 11: no bit is shared, but bit 2 lies between the set bits of 11 (bits 0, 1 and 3); all other operands match, and the check lines expect both to remain unmerged. +name: image_sample_l_dmask_not_disjoint_not_merged +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 11, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_0 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples use different address registers (%5 and %6, produced by two separate buffer loads); dmasks are 8 and 7, and the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_0 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_1 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %6, %6, %6, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %4, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples differ in their sgpr_256 operand (%3 vs %4, two separate S_LOAD_DWORDX8_IMM results with the same operands); dmasks are 8 and 7, and the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_1 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %5:vgpr_32 = COPY %2.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %6, %6, %6, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %4, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_2 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %6, %6, %6, %4, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %4, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples differ in their sgpr_128 operand (%2 vs %3, copies of different physical register tuples); dmasks are 8 and 7, and the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_2 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95 + %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %5:vgpr_32 = COPY %2.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %6, %6, %6, %4, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %6, %6, %6, %4, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_3 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples (dmask 8 and 7) share all register operands and differ only in the 4th immediate operand (0 vs 1); the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_4 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples (dmask 8 and 7) share all register operands and differ only in the 5th immediate operand (0 vs 1); the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_4 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_5 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples (dmask 8 and 7) differ only in the 4th immediate operand (0 vs 1); the check lines expect both to remain unmerged. NOTE(review): the body is operand-for-operand the same as image_sample_l_not_merged_3, so confirm whether a different operand was meant to vary here. +name: image_sample_l_not_merged_5 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_6 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The two samples (dmask 8 and 7) differ only in the 5th immediate operand (0 vs 1); the check lines expect both to remain unmerged. NOTE(review): the body is operand-for-operand the same as image_sample_l_not_merged_4, so confirm whether a different operand was meant to vary here. +name: image_sample_l_not_merged_6 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_7 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V2_nsa_gfx11 %5, %5, %3, %2, 8, 1, -1, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first sample is a V1_V2 form with two address operands and its 6th immediate operand set to 1; the second is a V3_V3 form with three address operands and that operand 0. The check lines expect both to remain unmerged. +name: image_sample_l_not_merged_7 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V2_nsa_gfx11 %5, %5, %3, %2, 8, 1, -1, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_8 +# GFX11: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) +# The first sample (dmask 8, vreg_64 result) has its 7th immediate operand set to 1; the second sample (dmask 7) has it 0. Register operands match, and the check lines expect both to remain unmerged. +name: image_sample_l_not_merged_8 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +...
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_9 +# GFX11: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load (s64), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_sample_l_not_merged_9 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + +# GFX11-LABEL: name: image_sample_l_not_merged_10 +# GFX11: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 4) + +name: image_sample_l_not_merged_10 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx11 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx11 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + + + +# GFX11-LABEL: name: image_sample_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V2_nsa_gfx11 %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_V1_V2_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_V3_V2_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_b_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_b_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_b_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_b_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_b_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V5_nsa_gfx11 %5, %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_b_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_b_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_b_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_b_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_b_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_b_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V5_nsa_gfx11 %5, %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_b_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_b_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V6_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_b_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_192 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V6_gfx11 %5:vreg_192, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V6_gfx11 %5:vreg_192, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_b_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V5_nsa_gfx11 %5, %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_b_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V5_nsa_gfx11 %5, %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_d_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V7_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_d_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_224 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_d_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V8_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_d_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_256 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_d_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V16_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_d_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_512 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_d_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V8_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_d_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_256 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_l_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_l_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_lz_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_lz_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_lz_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_lz_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_l_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V5_nsa_gfx11 %5, %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_l_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V5_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_c_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_c_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_d_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V6_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_d_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_192 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V6_gfx11 %5:vreg_192, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_V3_V6_gfx11 %5:vreg_192, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... +--- + + +# GFX11-LABEL: name: image_sample_d_cl_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V7_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_d_cl_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_224 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) 
+... +--- + + +# GFX11-LABEL: name: image_sample_d_cl_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V8_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_d_cl_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_256 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V8_gfx11 %5:vreg_256, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+--- + + +# GFX11-LABEL: name: image_sample_d_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V7_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_d_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vreg_224 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V7_gfx11 %5:vreg_224, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+# Test image_sample_lz_merged_v1v3 (the document opened by the `---` on the following line).
+# Input: two IMAGE_SAMPLE_LZ ..._V2_nsa_gfx11 instructions; the address is passed as two separate vgpr_32
+# operands (%5 twice, loaded by BUFFER_LOAD_DWORD_OFFSET) and both instructions share %3 and %2.
+# They differ in result width (vgpr_32 vs vreg_96), the first immediate (1 vs 14) and the memory operand.
+# Expected: one ..._V4_V2_nsa_gfx11 instruction whose first immediate is 15, then COPYs of sub0 and sub1_sub2_sub3.
+# NOTE(review): the 1/14/15 immediate is presumably the dmask - confirm against the MIMG operand layout.
+# The COPY check lines hard-code %8 as the merged result; %4 is defined but not read by either image instruction.
+--- + + +# GFX11-LABEL: name: image_sample_lz_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2_nsa_gfx11 %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_lz_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+# Test image_sample_lz_o_merged_v1v3 (the document opened by the `---` on the following line).
+# Input: two IMAGE_SAMPLE_LZ_O ..._V3_nsa_gfx11 instructions; the address is passed as three separate vgpr_32
+# operands (%5 three times, loaded by BUFFER_LOAD_DWORD_OFFSET) and both instructions share %3 and %2.
+# They differ in result width (vgpr_32 vs vreg_96), the first immediate (1 vs 14) and the memory operand.
+# Expected: one ..._V4_V3_nsa_gfx11 instruction whose first immediate is 15, then COPYs of sub0 and sub1_sub2_sub3.
+# NOTE(review): the 1/14/15 immediate is presumably the dmask - confirm against the MIMG operand layout.
+# The COPY check lines hard-code %8 as the merged result; %4 is defined but not read by either image instruction.
+--- + + +# GFX11-LABEL: name: image_sample_lz_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_lz_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+# Test image_sample_l_o_merged_v1v3 (the document opened by the `---` on the following line).
+# Input: two IMAGE_SAMPLE_L_O ..._V4_nsa_gfx11 instructions; the address is passed as four separate vgpr_32
+# operands (%5 four times, loaded by BUFFER_LOAD_DWORD_OFFSET) and both instructions share %3 and %2.
+# They differ in result width (vgpr_32 vs vreg_96), the first immediate (1 vs 14) and the memory operand.
+# Expected: one ..._V4_V4_nsa_gfx11 instruction whose first immediate is 15, then COPYs of sub0 and sub1_sub2_sub3.
+# NOTE(review): the 1/14/15 immediate is presumably the dmask - confirm against the MIMG operand layout.
+# The COPY check lines hard-code %8 as the merged result; %4 is defined but not read by either image instruction.
+--- + + +# GFX11-LABEL: name: image_sample_l_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4_nsa_gfx11 %5, %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_l_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... 
+# Test image_sample_o_merged_v1v3 (the document opened by the `---` on the following line).
+# Input: two IMAGE_SAMPLE_O ..._V3_nsa_gfx11 instructions; the address is passed as three separate vgpr_32
+# operands (%5 three times, loaded by BUFFER_LOAD_DWORD_OFFSET) and both instructions share %3 and %2.
+# They differ in result width (vgpr_32 vs vreg_96), the first immediate (1 vs 14) and the memory operand.
+# Expected: one ..._V4_V3_nsa_gfx11 instruction whose first immediate is 15, then COPYs of sub0 and sub1_sub2_sub3.
+# NOTE(review): the 1/14/15 immediate is presumably the dmask - confirm against the MIMG operand layout.
+# The COPY check lines hard-code %8 as the merged result; %4 is defined but not read by either image instruction.
+--- + + +# GFX11-LABEL: name: image_sample_o_merged_v1v3 +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V3_nsa_gfx11 %5, %5, %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 +# GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 + +name: image_sample_o_merged_v1v3 +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 + %4:vgpr_32 = COPY %2.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_O_V3_V3_nsa_gfx11 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +... +---