Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIAddIMGInitPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -141,6 +142,9 @@
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;
 
+void initializeSIAddIMGInitPass(PassRegistry &);
+extern char &SIAddIMGInitID;
+
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;

Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -373,6 +373,16 @@
   "Use ds_{read|write}_b128"
 >;
 
+// Sparse texture support requires that all result registers are zeroed when
+// PRTStrictNull is set to true. This feature is turned on for all architectures
+// but is enabled as a feature in case there are situations where PRTStrictNull
+// is disabled by the driver.
+def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
+  "EnablePRTStrictNull",
+  "true",
+  "Enable zeroing of result registers for sparse texture fetches"
+>;
+
 // Unless +-flat-for-global is specified, turn on FlatForGlobal for
 // all OS-es on VI and newer hardware to avoid assertion failures due
 // to missing ADDR64 variants of MUBUF instructions.

Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -297,6 +297,7 @@
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
   bool EnableDS128;
+  bool EnablePRTStrictNull;
   bool DumpCode;
 
   // Subtarget statically properties set by tablegen
@@ -537,6 +538,12 @@
     return getGeneration() < AMDGPUSubtarget::GFX9;
   }
 
+  /// \returns true if the target requires PRT strict null support (zeroed
+  /// result registers for sparse texture fetches).
+  bool usePRTStrictNull() const {
+    return EnablePRTStrictNull;
+  }
+
   bool hasAutoWaitcntBeforeBarrier() const {
     return AutoWaitcntBeforeBarrier;
   }

Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -72,6 +72,9 @@
   // We want to be able to turn these off, but making this a subtarget feature
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
+  //
+  // Similarly, we want enable-prt-strict-null to be on by default and not to
+  // unset everything else if it is disabled.
 
   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
 
@@ -87,6 +90,8 @@
     FullFS += "-fp32-denormals,";
   }
 
+  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, FullFS);
@@ -174,6 +179,7 @@
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
     EnableDS128(false),
+    EnablePRTStrictNull(false),
     DumpCode(false),
 
     FP64(false),

Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -810,6 +810,7 @@
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
+  addPass(createSIAddIMGInitPass());
   addPass(&SIFixSGPRCopiesID);
   return false;
 }

Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -90,6 +90,7 @@
   R600OptimizeVectorRegisters.cpp
   R600Packetizer.cpp
   R600RegisterInfo.cpp
+  SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp

Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -163,6 +163,8 @@
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
   }
 }
 
@@ -395,6 +397,8 @@
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
 
@@ -413,6 +417,8 @@
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
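A note on the texfailctrl operand that drives these new _V8 variants: as encoded by parseTexFail in the SIISelLowering.cpp change below, it is a bitfield with bit 0 requesting TFE (texture fail enable) and bit 1 requesting LWE (LOD warning enable); lowering bails out if any other bit is set. A minimal self-contained sketch of that encoding — the enum and helper names are illustrative, not part of the patch:

```c++
#include <cstdint>

// Illustrative restatement of the texfailctrl bitfield used by the image
// intrinsics in this patch: bit 0 = TFE, bit 1 = LWE. The values 1, 2 and 3
// below are exactly what the new tests pass as the texfailctrl argument.
enum TexFailCtrlBits : uint32_t { TFC_TFE = 0x1, TFC_LWE = 0x2 };

constexpr uint32_t encodeTexFailCtrl(bool TFE, bool LWE) {
  return (TFE ? TFC_TFE : 0u) | (LWE ? TFC_LWE : 0u);
}

static_assert(encodeTexFailCtrl(true, false) == 1, "tfe only");
static_assert(encodeTexFailCtrl(false, true) == 2, "lwe only");
static_assert(encodeTexFailCtrl(true, true) == 3, "tfe and lwe");
```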
Index: lib/Target/AMDGPU/SIAddIMGInit.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -0,0 +1,175 @@
+//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Any MIMG instruction that uses tfe or lwe requires an initialization of
+/// the result register that will be written in the case of a memory access
+/// failure. The required code is also added to tie this init code to the
+/// result of the image instruction.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "si-img-init"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPULaneDominator.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIAddIMGInit : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIAddIMGInit() : MachineFunctionPass(ID) {
+    initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Add IMG init"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE,
+                "SI Add IMG Init", false, false)
+
+char SIAddIMGInit::ID = 0;
+
+char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
+
+FunctionPass *llvm::createSIAddIMGInitPass() {
+  return new SIAddIMGInit();
+}
+
+bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      auto Opcode = MI.getOpcode();
+      if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore()) {
+        MachineOperand *tfe = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+        MachineOperand *lwe = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+        MachineOperand *d16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+        // Abandon the attempt for instructions that don't have tfe or lwe
+        // fields. There shouldn't be any at this point, but this allows for
+        // future variants.
+        if (!tfe && !lwe)
+          continue;
+
+        unsigned tfeVal = tfe ? tfe->getImm() : 0;
+        unsigned lweVal = lwe ? lwe->getImm() : 0;
+        unsigned d16Val = d16 ? d16->getImm() : 0;
+
+        if (tfeVal || lweVal) {
+          // At least one of TFE or LWE is non-zero, so we have to insert a
+          // suitable initialization of the result value and tie this to the
+          // dest of the image instruction.
+
+          const DebugLoc &DL = MI.getDebugLoc();
+
+          int dstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                  AMDGPU::OpName::vdata);
+
+          // Calculate which dword we have to initialize to 0.
+          MachineOperand *MO_Dmask =
+              TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+          // Abandon the attempt if no dmask operand is found.
+          if (!MO_Dmask)
+            continue;
+
+          unsigned dmask = MO_Dmask->getImm();
+
+          // Determine the number of active lanes, taking the Gather4 special
+          // case into account.
+          unsigned activeLanes =
+              TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
+
+          // Subreg indices are counted from 1. With D16 we want the next
+          // whole VGPR after the write data.
+          bool Packed = !ST.hasUnpackedD16VMem();
+          unsigned initIdx =
+              d16Val && Packed ? ((activeLanes + 1) >> 1) + 1
+                               : activeLanes + 1;
+
+          // Abandon the attempt if the dst size isn't large enough - this is
+          // in fact an error, but it is picked up elsewhere and reported
+          // correctly.
+          uint32_t dstSize =
+              RI->getRegSizeInBits(*TII->getOpRegClass(MI, dstIdx)) / 32;
+          if (dstSize < initIdx)
+            continue;
+
+          // Create a register for the initialization value.
+          unsigned prevDst =
+              MRI.createVirtualRegister(TII->getOpRegClass(MI, dstIdx));
+          BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), prevDst);
+
+          unsigned newDst = 0; // Final initialized value will be in here.
+
+          // If the PRTStrictNull feature is enabled (the default), initialize
+          // all the result registers to 0; otherwise initialize just the
+          // error indication register (VGPRn+1).
+          unsigned sizeLeft = ST.usePRTStrictNull() ? initIdx : 1;
+          unsigned currIdx = ST.usePRTStrictNull() ? 1 : initIdx;
+
+          for (; sizeLeft; sizeLeft--, currIdx++) {
+            newDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, dstIdx));
+            // Initialize the dword.
+            unsigned subReg =
+                MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+            BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), subReg)
+                .addImm(0);
+            // Insert it into the super-reg.
+            BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), newDst)
+                .addReg(prevDst)
+                .addReg(subReg)
+                .addImm(currIdx);
+
+            prevDst = newDst;
+          }
+
+          // Add newDst as an implicit operand.
+          MachineInstrBuilder(MF, MI).addReg(newDst, RegState::Implicit);
+
+          // Tie the just-added implicit operand to the dst.
+          MI.tieOperands(dstIdx, MI.getNumOperands() - 1);
+
+          Changed = true;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
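To make the pass's index arithmetic concrete: from the dmask (or the gather4 special case) and the D16 packing mode it derives the 1-based subregister index of the dword that receives the TFE/LWE error code; with PRTStrictNull enabled every dword up to and including that index is zeroed, otherwise only that one dword is. A standalone sketch of the same computation (the function name is illustrative):

```c++
#include <bitset>
#include <cassert>

// Illustrative restatement of SIAddIMGInit's index computation. Subregister
// indices are 1-based, matching the pass. Gather4 always returns four
// channels regardless of dmask.
unsigned errorDwordIndex(unsigned Dmask, bool IsGather4, bool IsD16,
                         bool PackedD16) {
  unsigned ActiveLanes =
      IsGather4 ? 4 : static_cast<unsigned>(std::bitset<32>(Dmask).count());
  // With packed D16, two half-dword values share each dword, so the error
  // code lands in the next whole VGPR after the rounded-up data dwords.
  if (IsD16 && PackedD16)
    return ((ActiveLanes + 1) >> 1) + 1;
  return ActiveLanes + 1;
}

int main() {
  assert(errorDwordIndex(0xf, false, false, false) == 5); // 4 data dwords + err
  assert(errorDwordIndex(0xf, false, true, true) == 3);   // 2 packed dwords + err
  assert(errorDwordIndex(0x1, true, false, false) == 5);  // gather4: always 4
  return 0;
}
```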
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -212,6 +212,7 @@
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -4469,6 +4470,22 @@
   return Value == 0;
 }
 
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG,
+                         SDValue *TFE, SDValue *LWE) {
+  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+  if (!TexFailCtrlConst)
+    return false;
+
+  uint64_t Value = TexFailCtrlConst->getZExtValue();
+  SDLoc DL(TexFailCtrlConst);
+  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x1;
+  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x2;
+
+  return Value == 0;
+}
+
 SDValue SITargetLowering::lowerImage(SDValue Op,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      SelectionDAG &DAG) const {
@@ -4532,7 +4549,16 @@
     IsD16 = true;
     if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
-      ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+      ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 :
+                       (LoadVT == MVT::v4f16) ? MVT::v4i32 : MVT::v8i32;
+    else if (LoadVT.isVector() && LoadVT == MVT::v8f16)
+      // Rather than add lots of code to handle v8f16 for this case, just
+      // treat it as v4i32 - this is reasonable since this is only done for
+      // TFE/LWE support anyway, and the result is a mixture of 4 packed
+      // 16-bit values and a 32-bit error condition (plus an unused 32-bit
+      // value):
+      // [ Res0 : Res1 ][ Res2 : Res3 ][ ErrCode ][ unused ]
+      ResultTypes[0] = MVT::v4i32;
   }
 
   NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
@@ -4581,9 +4607,10 @@
     CtrlIdx = AddrIdx + NumVAddrs + 3;
   }
 
+  SDValue TFE;
+  SDValue LWE;
   SDValue TexFail = Op.getOperand(CtrlIdx);
-  auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
-  if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+  if (!parseTexFail(TexFail, DAG, &TFE, &LWE))
     return Op;
 
   SDValue GLC;
@@ -4609,8 +4636,8 @@
   Ops.push_back(GLC);
   Ops.push_back(SLC);
   Ops.push_back(False); // r128
-  Ops.push_back(False); // tfe
-  Ops.push_back(False); // lwe
+  Ops.push_back(TFE); // tfe
+  Ops.push_back(LWE); // lwe
   Ops.push_back(DimInfo->DA ? True : False);
   if (BaseOpcode->HasD16)
     Ops.push_back(IsD16 ? True : False);
@@ -7748,6 +7775,7 @@
   case AMDGPU::sub1: return 1;
   case AMDGPU::sub2: return 2;
   case AMDGPU::sub3: return 3;
+  case AMDGPU::sub4: return 4; // This might be returned when using TFE/LWE
   }
 }
 
@@ -7761,11 +7789,16 @@
   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
     return Node; // not implemented for D16
 
-  SDNode *Users[4] = { nullptr };
+  SDNode *Users[5] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
   unsigned NewDmask = 0;
+  unsigned TFEIdx = DmaskIdx + 5;
+  unsigned LWEIdx = DmaskIdx + 6;
+  unsigned UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+                      Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+  unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
 
   if (OldDmask == 0) {
@@ -7773,6 +7806,12 @@
     return Node;
   }
 
+  // Work out which is the TFE/LWE lane if that is enabled.
+  if (UsesTFC) {
+    unsigned OldBitsSet = countPopulation(OldDmask);
+    TFCLane = OldBitsSet;
+  }
+
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
        I != E; ++I) {
@@ -7792,19 +7831,24 @@
     // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
+    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+    if (UsesTFC && Lane == TFCLane) {
+      Users[Lane] = *I;
+    } else {
+      // Set which texture component corresponds to the lane.
+      unsigned Comp;
+      for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+        Comp = countTrailingZeros(Dmask);
+        Dmask &= ~(1 << Comp);
+      }
 
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return Node;
+      // Abort if we have more than one user per component.
+      if (Users[Lane])
+        return Node;
 
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
+      Users[Lane] = *I;
+      NewDmask |= 1 << Comp;
+    }
   }
 
   // Abort if there's no change
@@ -7813,7 +7857,13 @@
 
   unsigned BitsSet = countPopulation(NewDmask);
 
-  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  // Check for TFE or LWE - increase the number of channels by one to account
+  // for the extra return value.
+  // This will need adjustment for D16 if that is also included in
+  // adjustWriteMask (this function), but at present D16 is excluded.
+  unsigned NewChannels = BitsSet + UsesTFC;
+
+  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
   assert(NewOpcode != -1 &&
          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
          "failed to find equivalent MIMG op");
@@ -7826,8 +7876,9 @@
 
   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
 
-  MVT ResultVT = BitsSet == 1 ?
-    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+  MVT ResultVT = NewChannels == 1 ?
+    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+                                NewChannels == 5 ? 8 : NewChannels);
   SDVTList NewVTList = HasChain ?
     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
 
@@ -7841,7 +7892,7 @@
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
   }
 
-  if (BitsSet == 1) {
+  if (NewChannels == 1) {
     assert(Node->hasNUsesOfValue(1, 0));
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                       SDLoc(Node), Users[Lane]->getValueType(0),
@@ -7851,7 +7902,7 @@
   }
 
   // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
     SDNode *User = Users[i];
     if (!User)
       continue;
@@ -7864,6 +7915,7 @@
       case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
       case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
      case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+      case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
     }
   }
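The adjustWritemask change is easiest to follow with numbers: the TFE/LWE value always lives in the lane just past the used data lanes, and that lane must survive dead-lane elimination without contributing a dmask bit. A hedged sketch of that bookkeeping in plain C++ (GCC/Clang bit builtins; names are illustrative):

```c++
#include <cassert>

// Sketch of dmask shrinking with a TFC (TFE/LWE) lane, following the
// adjustWritemask change above. UsedLanes is a bitmask of the result lanes
// the shader actually reads; with an old dmask of 0xf the TFC lane is lane 4.
struct ShrunkMask {
  unsigned NewDmask;
  unsigned NewChannels;
};

ShrunkMask shrinkDmask(unsigned OldDmask, unsigned UsedLanes, bool UsesTFC) {
  unsigned TFCLane = __builtin_popcount(OldDmask); // lane after the data lanes
  unsigned NewDmask = 0;
  for (unsigned Lane = 0; Lane < 32; ++Lane) {
    if (!(UsedLanes & (1u << Lane)))
      continue;
    if (UsesTFC && Lane == TFCLane)
      continue; // the error lane keeps no dmask bit, only an extra channel
    // Map the lane back to the Lane'th set bit of the old dmask.
    unsigned Comp = 0;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      Comp = __builtin_ctz(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  unsigned Channels = __builtin_popcount(NewDmask) + (UsesTFC ? 1 : 0);
  return {NewDmask, Channels};
}

int main() {
  // Matches sample_1d_tfe_adjust_writemask_24 in the tests below: data lanes
  // 1 and 3 plus the error lane 4 are read -> dmask 0xa, three channels.
  ShrunkMask R = shrinkDmask(0xf, (1u << 1) | (1u << 3) | (1u << 4), true);
  assert(R.NewDmask == 0xa && R.NewChannels == 3);
  return 0;
}
```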
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2776,6 +2776,42 @@
     }
   }
 
+  // Verify MIMG
+  if (isMIMG(MI.getOpcode()) && !get(MI.getOpcode()).mayStore()) {
+    // Ensure that the return type used is large enough for all the options
+    // being used. TFE/LWE require an extra result register.
+    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
+    if (DMask) {
+      uint64_t DMaskImm = DMask->getImm();
+      uint32_t RegCount = isGather4(MI.getOpcode()) ? 4
+                                                    : countPopulation(DMaskImm);
+      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
+      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
+      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
+      bool IsD16 = D16 ? D16->getImm() : false;
+
+      // TFE/LWE add one extra error result; under D16 that value still
+      // occupies a whole dword, i.e. two half-dword channels.
+      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
+        RegCount += IsD16 ? 2 : 1;
+
+      // Adjust for packed D16 variants, where two values share each dword.
+      bool Packed = !ST.hasUnpackedD16VMem();
+      if (IsD16 && Packed)
+        RegCount = (RegCount + 1) >> 1;
+
+      const uint32_t DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                         AMDGPU::OpName::vdata);
+      const MachineOperand &Dst = MI.getOperand(DstIdx);
+      if (Dst.isReg()) {
+        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
+        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
+        if (RegCount > DstSize) {
+          ErrInfo =
+              "MIMG instruction returns too many registers for dst register class";
+          return false;
+        }
+      }
+    }
+  }
+
   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
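The verifier rule added above reduces to a small pure function; this sketch (illustrative name) reproduces the arithmetic and checks it against two of the test cases that follow:

```c++
#include <cassert>

// Number of vdata dwords a MIMG load must provide, per the verifier above:
// gather4 always produces four channels; TFE/LWE add one error value (a
// whole dword, i.e. two half-dword channels under D16); packed D16 then
// halves the channel count, rounding up.
unsigned requiredVDataDwords(unsigned Dmask, bool IsGather4, bool TFE,
                             bool LWE, bool IsD16, bool PackedD16) {
  unsigned RegCount = IsGather4 ? 4 : __builtin_popcount(Dmask);
  if (TFE || LWE)
    RegCount += IsD16 ? 2 : 1;
  if (IsD16 && PackedD16)
    RegCount = (RegCount + 1) >> 1;
  return RegCount;
}

int main() {
  // load_1d_tfe below: dmask 0xf + tfe -> 5 dwords (hence the new _V8 forms).
  assert(requiredVDataDwords(0xf, false, true, false, false, false) == 5);
  // image_sample_2d_f16_tfe below: dmask 0x1, packed d16 + tfe -> 2 dwords.
  assert(requiredVDataDwords(0x1, false, true, false, true, true) == 2);
  return 0;
}
```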
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -1,89 +1,222 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
 
 ; GCN-LABEL: {{^}}load_1d:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_1d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v5, s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v0, s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v5, s[0:7] dmask:0xf unorm lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v0, s[0:7] dmask:0xf unorm lwe{{$}}
+define amdgpu_ps <8 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2d:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:6], s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:1], s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2d.v8f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_3d:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_3d_tfe_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+define amdgpu_ps <8 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.3d.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_cube:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_cube_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm lwe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <8 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.cube.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_1darray:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_1darray_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:6], s[0:7] dmask:0xf unorm tfe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:1], s[0:7] dmask:0xf unorm tfe da{{$}}
+define amdgpu_ps <8 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1darray.v8f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2darray:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2darray_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm lwe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <8 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2darray.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2dmsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2dmsaa_both:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+define amdgpu_ps <8 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2dmsaa.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2darraymsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe da{{$}}
+define amdgpu_ps <8 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2darraymsaa.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_1d:
 ; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_mip_1d_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load_mip v[0:7], v[5:6], s[0:7] dmask:0xf unorm lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load_mip v[0:7], v[0:1], s[0:7] dmask:0xf unorm lwe{{$}}
+define amdgpu_ps <8 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.mip.1d.v8f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_2d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_mip_2d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load_mip v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load_mip v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.mip.2d.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_3d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -92,6 +225,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_cube:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -100,6 +234,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_1darray:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -108,6 +243,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_2darray:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -116,6 +252,7 @@
 
 ; GCN-LABEL: {{^}}store_1d:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -124,6 +261,7 @@
 
 ; GCN-LABEL: {{^}}store_2d:
 ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
 main_body:
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -132,6 +270,7 @@
 
 ; GCN-LABEL: {{^}}store_3d:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) {
 main_body:
   call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -140,6 +279,7 @@
 
 ; GCN-LABEL: {{^}}store_cube:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -148,6 +288,7 @@
 
 ; GCN-LABEL: {{^}}store_1darray:
 ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -156,6 +297,7 @@
 
 ; GCN-LABEL: {{^}}store_2darray:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -164,6 +306,7 @@
 
 ; GCN-LABEL: {{^}}store_2dmsaa:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -172,6 +315,7 @@
 
 ; GCN-LABEL: {{^}}store_2darraymsaa:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -180,6 +324,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_1d:
 ; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -188,6 +333,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_2d:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -196,6 +342,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_3d:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -204,6 +351,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_cube:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -212,6 +360,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_1darray:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -220,6 +369,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_2darray:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -228,6 +378,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_1d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -236,6 +387,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -244,6 +396,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_3d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -252,6 +405,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_cube:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -260,6 +414,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_1darray:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -268,6 +423,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2darray:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -276,6 +432,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2dmsaa:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -284,6 +441,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2darraymsaa:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -292,6 +450,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_V1:
 ; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}}
+; NOPRT: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}}
 define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -300,6 +459,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_V2:
 ; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}}
+; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}}
 define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -308,6 +468,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_V1:
 ; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}}
+; NOPRT: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}}
 define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -316,6 +477,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_V2:
 ; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}}
+; NOPRT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}}
 define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -324,6 +486,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_glc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}}
 define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -332,6 +495,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_slc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}}
 define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -340,6 +504,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_glc_slc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}}
 define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -348,6 +513,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_glc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}}
 define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -356,6 +522,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_slc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}}
 define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -364,6 +531,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_glc_slc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}}
 define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -404,23 +572,33 @@
   store float 0.000000e+00, float addrspace(3)* %lds
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
-  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
   store float 0.000000e+00, float addrspace(3)* %tmp2
   ret float %tex
 }
 
 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2d.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.3d.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.cube.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.1darray.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2darray.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2dmsaa.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2darraymsaa.v8f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.mip.1d.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.mip.2d.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
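One point worth calling out from the tests above: dmask 0xf plus TFE only needs five dwords, yet the IR type is <8 x float> and the selected instruction is the _V8 variant writing v[0:7], because this patch provides neither a five-element vector type nor a five-dword MIMG variant; lanes 5-7 are simply unused. The rounding mirrors the ResultVT logic in adjustWritemask; a trivial sketch:

```c++
#include <cassert>

// Vector width used for the MIMG result node, mirroring the ResultVT
// rounding in adjustWritemask: 3 channels round up to a 4-wide vector and
// 5 channels (4 data + error) round up to an 8-wide one.
unsigned resultVectorWidth(unsigned NewChannels) {
  if (NewChannels == 3)
    return 4;
  if (NewChannels == 5)
    return 8;
  return NewChannels;
}

int main() {
  assert(resultVectorWidth(5) == 8); // e.g. load_1d_tfe: v[0:7], lanes 5-7 unused
  assert(resultVectorWidth(3) == 4);
  return 0;
}
```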
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -10,6 +10,17 @@
   ret half %tex
 }
 
+; GCN-LABEL: {{^}}image_sample_2d_f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; PACKED: image_sample v[{{[0-9]+:[0-9]+}}], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+; UNPACKED: image_sample v[{{[0-9]+:[0-9]+}}], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+define amdgpu_ps <2 x float> @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <4 x half> %tex to <2 x float>
+  ret <2 x float> %r
+}
+
 ; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
 ; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
 ; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
@@ -20,6 +31,17 @@
   ret float %r
 }
 
+; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.d.1d.v4f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <4 x half> %tex to <2 x float>
+  ret <2 x float> %r
+}
+
 ; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
 ; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
 ; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
@@ -30,9 +52,23 @@
   ret <2 x float> %r
 }
 
+; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+main_body:
+  %tex = call <8 x half> @llvm.amdgcn.image.sample.b.2d.v8f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <8 x half> %tex to <4 x float>
+  ret <4 x float> %r
+}
+
 declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.image.sample.c.d.1d.v4f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x half> @llvm.amdgcn.image.sample.b.2d.v8f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -9,6 +9,114 @@
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}sample_1d_tfe:
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+define amdgpu_ps <8 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 0
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 1
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 2
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12:
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 0
+  %res.f2 = extractelement <8 x float> %v, i32 1
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res = insertelement <4 x float> %res.tmp2, float %res.err, i32 2
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24:
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 1
+  %res.f2 = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res = insertelement <4 x float> %res.tmp2, float %res.err, i32 2
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134:
+; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 0
+  %res.f2 = extractelement <8 x float> %v, i32 2
+  %res.f3 = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2
+  %res = insertelement <4 x float> %res.tmp3, float %res.err, i32 3
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_lwe:
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+define amdgpu_ps <8 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}sample_2d:
 ; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
 define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
@@ -491,6 +599,7 @@
 }
 
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1