Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIAddIMGInitPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -141,6 +142,9 @@
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;
 
+void initializeSIAddIMGInitPass(PassRegistry &);
+extern char &SIAddIMGInitID;
+
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;

Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -373,6 +373,16 @@
   "Use ds_{read|write}_b128"
 >;
 
+// Sparse texture support requires that all result registers are zeroed when
+// PRTStrictNull is set to true. This feature is turned on for all architectures
+// but is enabled as a feature in case there are situations where PRTStrictNull
+// is disabled by the driver.
+def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
+  "EnablePRTStrictNull",
+  "true",
+  "Enable zeroing of result registers for sparse texture fetches"
+>;
+
 // Unless +-flat-for-global is specified, turn on FlatForGlobal for
 // all OS-es on VI and newer hardware to avoid assertion failures due
 // to missing ADDR64 variants of MUBUF instructions.

Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -297,6 +297,7 @@
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
   bool EnableDS128;
+  bool EnablePRTStrictNull;
   bool DumpCode;
 
   // Subtarget statically properties set by tablegen
@@ -537,6 +538,12 @@
     return getGeneration() < AMDGPUSubtarget::GFX9;
   }
 
+  /// \returns true if the target requires PRT strict null support (zeroed
+  /// result registers for sparse texture fetches).
+  bool usePRTStrictNull() const {
+    return EnablePRTStrictNull;
+  }
+
   bool hasAutoWaitcntBeforeBarrier() const {
     return AutoWaitcntBeforeBarrier;
   }

Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -72,6 +72,9 @@
   // We want to be able to turn these off, but making this a subtarget feature
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
+  //
+  // Similarly, we want enable-prt-strict-null to be on by default and not to
+  // unset everything else if it is disabled.
 
   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
 
@@ -87,6 +90,8 @@
     FullFS += "-fp32-denormals,";
   }
 
+  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, FullFS);
@@ -174,6 +179,7 @@
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
     EnableDS128(false),
+    EnablePRTStrictNull(false),
     DumpCode(false),
 
     FP64(false),

Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -810,6 +810,7 @@
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
+  addPass(createSIAddIMGInitPass());
   addPass(&SIFixSGPRCopiesID);
   return false;
 }

Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -90,6 +90,7 @@
   R600OptimizeVectorRegisters.cpp
   R600Packetizer.cpp
   R600RegisterInfo.cpp
+  SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp

Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -163,6 +163,8 @@
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
   }
 }
 
@@ -395,6 +397,8 @@
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
 
@@ -413,6 +417,8 @@
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+    let VDataDwords = 8 in
+    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
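A note on the texfailctrl operand that drives these new _V8 variants: as encoded by parseTexFail in the SIISelLowering.cpp change below, it is a bitfield with bit 0 requesting TFE (texture fail enable) and bit 1 requesting LWE (LOD warning enable); lowering bails out if any other bit is set. A minimal self-contained sketch of that encoding — the enum and helper names are illustrative, not part of the patch:

```c++
#include <cstdint>

// Illustrative restatement of the texfailctrl bitfield used by the image
// intrinsics in this patch: bit 0 = TFE, bit 1 = LWE. The values 1, 2 and 3
// below are exactly what the new tests pass as the texfailctrl argument.
enum TexFailCtrlBits : uint32_t { TFC_TFE = 0x1, TFC_LWE = 0x2 };

constexpr uint32_t encodeTexFailCtrl(bool TFE, bool LWE) {
  return (TFE ? TFC_TFE : 0u) | (LWE ? TFC_LWE : 0u);
}

static_assert(encodeTexFailCtrl(true, false) == 1, "tfe only");
static_assert(encodeTexFailCtrl(false, true) == 2, "lwe only");
static_assert(encodeTexFailCtrl(true, true) == 3, "tfe and lwe");
```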
Index: lib/Target/AMDGPU/SIAddIMGInit.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -0,0 +1,175 @@
+//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Any MIMG instruction that uses tfe or lwe requires an initialization of
+/// the result register that will be written in the case of a memory access
+/// failure. The required code is also added to tie this init code to the
+/// result of the image instruction.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "si-img-init"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPULaneDominator.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIAddIMGInit : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIAddIMGInit() : MachineFunctionPass(ID) {
+    initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Add IMG init"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE,
+                "SI Add IMG Init", false, false)
+
+char SIAddIMGInit::ID = 0;
+
+char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
+
+FunctionPass *llvm::createSIAddIMGInitPass() {
+  return new SIAddIMGInit();
+}
+
+bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      auto Opcode = MI.getOpcode();
+      if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore()) {
+        MachineOperand *tfe = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+        MachineOperand *lwe = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+        MachineOperand *d16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+        // Abandon the attempt for instructions that don't have tfe or lwe
+        // fields. There shouldn't be any at this point, but this allows for
+        // future variants.
+        if (!tfe && !lwe)
+          continue;
+
+        unsigned tfeVal = tfe ? tfe->getImm() : 0;
+        unsigned lweVal = lwe ? lwe->getImm() : 0;
+        unsigned d16Val = d16 ? d16->getImm() : 0;
+
+        if (tfeVal || lweVal) {
+          // At least one of TFE or LWE is non-zero, so we have to insert a
+          // suitable initialization of the result value and tie this to the
+          // dest of the image instruction.
+
+          const DebugLoc &DL = MI.getDebugLoc();
+
+          int dstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                  AMDGPU::OpName::vdata);
+
+          // Calculate which dword we have to initialize to 0.
+          MachineOperand *MO_Dmask =
+              TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+          // Abandon the attempt if no dmask operand is found.
+          if (!MO_Dmask)
+            continue;
+
+          unsigned dmask = MO_Dmask->getImm();
+
+          // Determine the number of active lanes, taking the Gather4 special
+          // case into account.
+          unsigned activeLanes =
+              TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
+
+          // Subreg indices are counted from 1. With D16 we want the next
+          // whole VGPR after the write data.
+          bool Packed = !ST.hasUnpackedD16VMem();
+          unsigned initIdx =
+              d16Val && Packed ? ((activeLanes + 1) >> 1) + 1
+                               : activeLanes + 1;
+
+          // Abandon the attempt if the dst size isn't large enough - this is
+          // in fact an error, but it is picked up elsewhere and reported
+          // correctly.
+          uint32_t dstSize =
+              RI->getRegSizeInBits(*TII->getOpRegClass(MI, dstIdx)) / 32;
+          if (dstSize < initIdx)
+            continue;
+
+          // Create a register for the initialization value.
+          unsigned prevDst =
+              MRI.createVirtualRegister(TII->getOpRegClass(MI, dstIdx));
+          BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), prevDst);
+
+          unsigned newDst = 0; // Final initialized value will be in here.
+
+          // If the PRTStrictNull feature is enabled (the default), initialize
+          // all the result registers to 0; otherwise initialize just the
+          // error indication register (VGPRn+1).
+          unsigned sizeLeft = ST.usePRTStrictNull() ? initIdx : 1;
+          unsigned currIdx = ST.usePRTStrictNull() ? 1 : initIdx;
+
+          for (; sizeLeft; sizeLeft--, currIdx++) {
+            newDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, dstIdx));
+            // Initialize the dword.
+            unsigned subReg =
+                MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+            BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), subReg)
+                .addImm(0);
+            // Insert it into the super-reg.
+            BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), newDst)
+                .addReg(prevDst)
+                .addReg(subReg)
+                .addImm(currIdx);
+
+            prevDst = newDst;
+          }
+
+          // Add newDst as an implicit operand.
+          MachineInstrBuilder(MF, MI).addReg(newDst, RegState::Implicit);
+
+          // Tie the just-added implicit operand to the dst.
+          MI.tieOperands(dstIdx, MI.getNumOperands() - 1);
+
+          Changed = true;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
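To make the pass's index arithmetic concrete: from the dmask (or the gather4 special case) and the D16 packing mode it derives the 1-based subregister index of the dword that receives the TFE/LWE error code; with PRTStrictNull enabled every dword up to and including that index is zeroed, otherwise only that one dword is. A standalone sketch of the same computation (the function name is illustrative):

```c++
#include <bitset>
#include <cassert>

// Illustrative restatement of SIAddIMGInit's index computation. Subregister
// indices are 1-based, matching the pass. Gather4 always returns four
// channels regardless of dmask.
unsigned errorDwordIndex(unsigned Dmask, bool IsGather4, bool IsD16,
                         bool PackedD16) {
  unsigned ActiveLanes =
      IsGather4 ? 4 : static_cast<unsigned>(std::bitset<32>(Dmask).count());
  // With packed D16, two half-dword values share each dword, so the error
  // code lands in the next whole VGPR after the rounded-up data dwords.
  if (IsD16 && PackedD16)
    return ((ActiveLanes + 1) >> 1) + 1;
  return ActiveLanes + 1;
}

int main() {
  assert(errorDwordIndex(0xf, false, false, false) == 5); // 4 data dwords + err
  assert(errorDwordIndex(0xf, false, true, true) == 3);   // 2 packed dwords + err
  assert(errorDwordIndex(0x1, true, false, false) == 5);  // gather4: always 4
  return 0;
}
```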
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -212,6 +212,7 @@
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -4469,6 +4470,22 @@
   return Value == 0;
 }
 
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG,
+                         SDValue *TFE, SDValue *LWE) {
+  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+  if (!TexFailCtrlConst)
+    return false;
+
+  uint64_t Value = TexFailCtrlConst->getZExtValue();
+  SDLoc DL(TexFailCtrlConst);
+  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x1;
+  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x2;
+
+  return Value == 0;
+}
+
 SDValue SITargetLowering::lowerImage(SDValue Op,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      SelectionDAG &DAG) const {
@@ -4532,7 +4549,16 @@
     IsD16 = true;
     if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
-      ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+      ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 :
+                       (LoadVT == MVT::v4f16) ? MVT::v4i32 : MVT::v8i32;
+    else if (LoadVT.isVector() && LoadVT == MVT::v8f16)
+      // Rather than add lots of code to handle v8f16 for this case, just
+      // treat it as v4i32 - this is reasonable since this is only done for
+      // TFE/LWE support anyway, and the result is a mixture of 4 packed
+      // 16-bit values and a 32-bit error condition (plus an unused 32-bit
+      // value):
+      // [ Res0 : Res1 ][ Res2 : Res3 ][ ErrCode ][ unused ]
+      ResultTypes[0] = MVT::v4i32;
   }
 
   NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
@@ -4581,9 +4607,10 @@
     CtrlIdx = AddrIdx + NumVAddrs + 3;
   }
 
+  SDValue TFE;
+  SDValue LWE;
   SDValue TexFail = Op.getOperand(CtrlIdx);
-  auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
-  if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+  if (!parseTexFail(TexFail, DAG, &TFE, &LWE))
     return Op;
 
   SDValue GLC;
@@ -4609,8 +4636,8 @@
   Ops.push_back(GLC);
   Ops.push_back(SLC);
   Ops.push_back(False); // r128
-  Ops.push_back(False); // tfe
-  Ops.push_back(False); // lwe
+  Ops.push_back(TFE); // tfe
+  Ops.push_back(LWE); // lwe
   Ops.push_back(DimInfo->DA ? True : False);
   if (BaseOpcode->HasD16)
     Ops.push_back(IsD16 ? True : False);
@@ -7748,6 +7775,7 @@
   case AMDGPU::sub1: return 1;
   case AMDGPU::sub2: return 2;
   case AMDGPU::sub3: return 3;
+  case AMDGPU::sub4: return 4; // This might be returned when using TFE/LWE
   }
 }
 
@@ -7761,11 +7789,16 @@
   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
     return Node; // not implemented for D16
 
-  SDNode *Users[4] = { nullptr };
+  SDNode *Users[5] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
   unsigned NewDmask = 0;
+  unsigned TFEIdx = DmaskIdx + 5;
+  unsigned LWEIdx = DmaskIdx + 6;
+  unsigned UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+                      Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+  unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
 
   if (OldDmask == 0) {
@@ -7773,6 +7806,12 @@
     return Node;
   }
 
+  // Work out which is the TFE/LWE lane if that is enabled.
+  if (UsesTFC) {
+    unsigned OldBitsSet = countPopulation(OldDmask);
+    TFCLane = OldBitsSet;
+  }
+
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
        I != E; ++I) {
@@ -7792,19 +7831,24 @@
     // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
+    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+    if (UsesTFC && Lane == TFCLane) {
+      Users[Lane] = *I;
+    } else {
+      // Set which texture component corresponds to the lane.
+      unsigned Comp;
+      for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+        Comp = countTrailingZeros(Dmask);
+        Dmask &= ~(1 << Comp);
+      }
 
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return Node;
+      // Abort if we have more than one user per component.
+      if (Users[Lane])
+        return Node;
 
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
+      Users[Lane] = *I;
+      NewDmask |= 1 << Comp;
+    }
   }
 
   // Abort if there's no change
@@ -7813,7 +7857,13 @@
 
   unsigned BitsSet = countPopulation(NewDmask);
 
-  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  // Check for TFE or LWE - increase the number of channels by one to account
+  // for the extra return value.
+  // This will need adjustment for D16 if that is also included in
+  // adjustWriteMask (this function), but at present D16 is excluded.
+  unsigned NewChannels = BitsSet + UsesTFC;
+
+  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
   assert(NewOpcode != -1 &&
          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
          "failed to find equivalent MIMG op");
@@ -7826,8 +7876,9 @@
 
   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
 
-  MVT ResultVT = BitsSet == 1 ?
-    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+  MVT ResultVT = NewChannels == 1 ?
+    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+                                NewChannels == 5 ? 8 : NewChannels);
   SDVTList NewVTList = HasChain ?
     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
 
@@ -7841,7 +7892,7 @@
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
   }
 
-  if (BitsSet == 1) {
+  if (NewChannels == 1) {
     assert(Node->hasNUsesOfValue(1, 0));
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                       SDLoc(Node), Users[Lane]->getValueType(0),
@@ -7851,7 +7902,7 @@
   }
 
   // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
     SDNode *User = Users[i];
     if (!User)
       continue;
@@ -7864,6 +7915,7 @@
       case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
       case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
      case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+      case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
     }
   }
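The adjustWritemask change is easiest to follow with numbers: the TFE/LWE value always lives in the lane just past the used data lanes, and that lane must survive dead-lane elimination without contributing a dmask bit. A hedged sketch of that bookkeeping in plain C++ (GCC/Clang bit builtins; names are illustrative):

```c++
#include <cassert>

// Sketch of dmask shrinking with a TFC (TFE/LWE) lane, following the
// adjustWritemask change above. UsedLanes is a bitmask of the result lanes
// the shader actually reads; with an old dmask of 0xf the TFC lane is lane 4.
struct ShrunkMask {
  unsigned NewDmask;
  unsigned NewChannels;
};

ShrunkMask shrinkDmask(unsigned OldDmask, unsigned UsedLanes, bool UsesTFC) {
  unsigned TFCLane = __builtin_popcount(OldDmask); // lane after the data lanes
  unsigned NewDmask = 0;
  for (unsigned Lane = 0; Lane < 32; ++Lane) {
    if (!(UsedLanes & (1u << Lane)))
      continue;
    if (UsesTFC && Lane == TFCLane)
      continue; // the error lane keeps no dmask bit, only an extra channel
    // Map the lane back to the Lane'th set bit of the old dmask.
    unsigned Comp = 0;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      Comp = __builtin_ctz(Dmask);
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  unsigned Channels = __builtin_popcount(NewDmask) + (UsesTFC ? 1 : 0);
  return {NewDmask, Channels};
}

int main() {
  // Matches sample_1d_tfe_adjust_writemask_24 in the tests below: data lanes
  // 1 and 3 plus the error lane 4 are read -> dmask 0xa, three channels.
  ShrunkMask R = shrinkDmask(0xf, (1u << 1) | (1u << 3) | (1u << 4), true);
  assert(R.NewDmask == 0xa && R.NewChannels == 3);
  return 0;
}
```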
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2776,6 +2776,42 @@
     }
   }
 
+  // Verify MIMG
+  if (isMIMG(MI.getOpcode()) && !get(MI.getOpcode()).mayStore()) {
+    // Ensure that the return type used is large enough for all the options
+    // being used. TFE/LWE require an extra result register.
+    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
+    if (DMask) {
+      uint64_t DMaskImm = DMask->getImm();
+      uint32_t RegCount = isGather4(MI.getOpcode()) ? 4
+                                                    : countPopulation(DMaskImm);
+      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
+      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
+      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
+      bool IsD16 = D16 ? D16->getImm() : false;
+
+      // TFE/LWE add one extra error result; under D16 that value still
+      // occupies a whole dword, i.e. two half-dword channels.
+      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
+        RegCount += IsD16 ? 2 : 1;
+
+      // Adjust for packed D16 variants, where two values share each dword.
+      bool Packed = !ST.hasUnpackedD16VMem();
+      if (IsD16 && Packed)
+        RegCount = (RegCount + 1) >> 1;
+
+      const uint32_t DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                         AMDGPU::OpName::vdata);
+      const MachineOperand &Dst = MI.getOperand(DstIdx);
+      if (Dst.isReg()) {
+        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
+        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
+        if (RegCount > DstSize) {
+          ErrInfo =
+              "MIMG instruction returns too many registers for dst register class";
+          return false;
+        }
+      }
+    }
+  }
+
   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
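The verifier rule added above reduces to a small pure function; this sketch (illustrative name) reproduces the arithmetic and checks it against two of the test cases that follow:

```c++
#include <cassert>

// Number of vdata dwords a MIMG load must provide, per the verifier above:
// gather4 always produces four channels; TFE/LWE add one error value (a
// whole dword, i.e. two half-dword channels under D16); packed D16 then
// halves the channel count, rounding up.
unsigned requiredVDataDwords(unsigned Dmask, bool IsGather4, bool TFE,
                             bool LWE, bool IsD16, bool PackedD16) {
  unsigned RegCount = IsGather4 ? 4 : __builtin_popcount(Dmask);
  if (TFE || LWE)
    RegCount += IsD16 ? 2 : 1;
  if (IsD16 && PackedD16)
    RegCount = (RegCount + 1) >> 1;
  return RegCount;
}

int main() {
  // load_1d_tfe below: dmask 0xf + tfe -> 5 dwords (hence the new _V8 forms).
  assert(requiredVDataDwords(0xf, false, true, false, false, false) == 5);
  // image_sample_2d_f16_tfe below: dmask 0x1, packed d16 + tfe -> 2 dwords.
  assert(requiredVDataDwords(0x1, false, true, false, true, true) == 2);
  return 0;
}
```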
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -1,89 +1,222 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
 
 ; GCN-LABEL: {{^}}load_1d:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_1d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v5, s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v0, s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v5, s[0:7] dmask:0xf unorm lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v0, s[0:7] dmask:0xf unorm lwe{{$}}
+define amdgpu_ps <8 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2d:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:6], s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:1], s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2d.v8f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_3d:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_3d_tfe_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+define amdgpu_ps <8 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.3d.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_cube:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_cube_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm lwe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <8 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.cube.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_1darray:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_1darray_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:6], s[0:7] dmask:0xf unorm tfe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:1], s[0:7] dmask:0xf unorm tfe da{{$}}
+define amdgpu_ps <8 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.1darray.v8f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2darray:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2darray_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm lwe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm lwe da{{$}}
+define amdgpu_ps <8 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2darray.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2dmsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2dmsaa_both:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+define amdgpu_ps <8 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2dmsaa.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_2darraymsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe da{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe da{{$}}
+define amdgpu_ps <8 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.2darraymsaa.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_1d:
 ; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_mip_1d_lwe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load_mip v[0:7], v[5:6], s[0:7] dmask:0xf unorm lwe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load_mip v[0:7], v[0:1], s[0:7] dmask:0xf unorm lwe{{$}}
+define amdgpu_ps <8 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.mip.1d.v8f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_2d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}load_mip_2d_tfe:
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: image_load_mip v[0:7], v[5:8], s[0:7] dmask:0xf unorm tfe{{$}}
+; NOPRT: v_mov_b32_e32 v4, 0
+; NOPRT: image_load_mip v[0:7], v[0:3], s[0:7] dmask:0xf unorm tfe{{$}}
+define amdgpu_ps <8 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.load.mip.2d.v8f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}load_mip_3d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -92,6 +225,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_cube:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -100,6 +234,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_1darray:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -108,6 +243,7 @@
 
 ; GCN-LABEL: {{^}}load_mip_2darray:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -116,6 +252,7 @@
 
 ; GCN-LABEL: {{^}}store_1d:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -124,6 +261,7 @@
 
 ; GCN-LABEL: {{^}}store_2d:
 ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
 main_body:
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -132,6 +270,7 @@
 
 ; GCN-LABEL: {{^}}store_3d:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) {
 main_body:
   call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -140,6 +279,7 @@
 
 ; GCN-LABEL: {{^}}store_cube:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -148,6 +288,7 @@
 
 ; GCN-LABEL: {{^}}store_1darray:
 ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -156,6 +297,7 @@
 
 ; GCN-LABEL: {{^}}store_2darray:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
 main_body:
   call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
@@ -164,6 +306,7 @@
 
 ; GCN-LABEL: {{^}}store_2dmsaa:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -172,6 +315,7 @@
 
 ; GCN-LABEL: {{^}}store_2darraymsaa:
 ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -180,6 +324,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_1d:
 ; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -188,6 +333,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_2d:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -196,6 +342,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_3d:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -204,6 +351,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_cube:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -212,6 +360,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_1darray:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -220,6 +369,7 @@
 
 ; GCN-LABEL: {{^}}store_mip_2darray:
 ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -228,6 +378,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_1d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -236,6 +387,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -244,6 +396,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_3d:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -252,6 +405,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_cube:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -260,6 +414,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_1darray:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -268,6 +423,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2darray:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -276,6 +432,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2dmsaa:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -284,6 +441,7 @@
 
 ; GCN-LABEL: {{^}}getresinfo_2darraymsaa:
 ; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
+; NOPRT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
@@ -292,6 +450,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_V1:
 ; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}}
+; NOPRT: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}}
 define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -300,6 +459,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_V2:
 ; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}}
+; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}}
 define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -308,6 +468,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_V1:
 ; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}}
+; NOPRT: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}}
 define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -316,6 +477,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_V2:
 ; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}}
+; NOPRT: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}}
 define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -324,6 +486,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_glc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}}
 define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -332,6 +495,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_slc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}}
 define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -340,6 +504,7 @@
 
 ; GCN-LABEL: {{^}}load_1d_glc_slc:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}}
+; NOPRT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}}
 define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -348,6 +513,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_glc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}}
 define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
@@ -356,6 +522,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_slc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}}
 define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
@@ -364,6 +531,7 @@
 
 ; GCN-LABEL: {{^}}store_1d_glc_slc:
 ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}}
+; NOPRT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}}
 define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 main_body:
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
@@ -404,23 +572,33 @@
   store float 0.000000e+00, float addrspace(3)* %lds
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
-  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
   store float 0.000000e+00, float addrspace(3)* %tmp2
   ret float %tex
 }
 
 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.1d.v8f32.i32(i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2d.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.3d.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.cube.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.1darray.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2darray.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2dmsaa.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.2darraymsaa.v8f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.mip.1d.v8f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.load.mip.2d.v8f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
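One point worth calling out from the tests above: dmask 0xf plus TFE only needs five dwords, yet the IR type is <8 x float> and the selected instruction is the _V8 variant writing v[0:7], because this patch provides neither a five-element vector type nor a five-dword MIMG variant; lanes 5-7 are simply unused. The rounding mirrors the ResultVT logic in adjustWritemask; a trivial sketch:

```c++
#include <cassert>

// Vector width used for the MIMG result node, mirroring the ResultVT
// rounding in adjustWritemask: 3 channels round up to a 4-wide vector and
// 5 channels (4 data + error) round up to an 8-wide one.
unsigned resultVectorWidth(unsigned NewChannels) {
  if (NewChannels == 3)
    return 4;
  if (NewChannels == 5)
    return 8;
  return NewChannels;
}

int main() {
  assert(resultVectorWidth(5) == 8); // e.g. load_1d_tfe: v[0:7], lanes 5-7 unused
  assert(resultVectorWidth(3) == 4);
  return 0;
}
```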
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -10,6 +10,17 @@
   ret half %tex
 }
 
+; GCN-LABEL: {{^}}image_sample_2d_f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; PACKED: image_sample v[{{[0-9]+:[0-9]+}}], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+; UNPACKED: image_sample v[{{[0-9]+:[0-9]+}}], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
+define amdgpu_ps <2 x float> @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <4 x half> %tex to <2 x float>
+  ret <2 x float> %r
+}
+
 ; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
 ; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
 ; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
@@ -20,6 +31,17 @@
   ret float %r
 }
 
+; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
+define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+main_body:
+  %tex = call <4 x half> @llvm.amdgcn.image.sample.c.d.1d.v4f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <4 x half> %tex to <2 x float>
+  ret <2 x float> %r
+}
+
 ; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
 ; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
 ; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
@@ -30,9 +52,23 @@
   ret <2 x float> %r
 }
 
+; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
+; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
+define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+main_body:
+  %tex = call <8 x half> @llvm.amdgcn.image.sample.b.2d.v8f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+  %r = bitcast <8 x half> %tex to <4 x float>
+  ret <4 x float> %r
+}
+
 declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.image.sample.c.d.1d.v4f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x half> @llvm.amdgcn.image.sample.b.2d.v8f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -9,6 +9,114 @@
   ret <4 x float> %v
 }
 
+; GCN-LABEL: {{^}}sample_1d_tfe:
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+define amdgpu_ps <8 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  ret <8 x float> %v
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 0
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 1
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 2
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4:
+; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}}
+define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
+  %res = insertelement <2 x float> %res.tmp, float %res.err, i32 1
+  ret <2 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12:
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 0
+  %res.f2 = extractelement <8 x float> %v, i32 1
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res = insertelement <4 x float> %res.tmp2, float %res.err, i32 2
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24:
+; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 1
+  %res.f2 = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res = insertelement <4 x float> %res.tmp2, float %res.err, i32 2
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134:
+; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}}
+define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+  %res.f1 = extractelement <8 x float> %v, i32 0
+  %res.f2 = extractelement <8 x float> %v, i32 2
+  %res.f3 = extractelement <8 x float> %v, i32 3
+  %res.err = extractelement <8 x float> %v, i32 4
+  %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
+  %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
+  %res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2
+  %res = insertelement <4 x float> %res.tmp3, float %res.err, i32 3
+  ret <4 x float> %res
+}
+
+; GCN-LABEL: {{^}}sample_1d_lwe:
+; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+define amdgpu_ps <8 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
+  ret <8 x float> %v
+}
+
 ; GCN-LABEL: {{^}}sample_2d:
 ; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
 define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
@@ -491,6 +599,7 @@
 }
 
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <8 x float> @llvm.amdgcn.image.sample.1d.v8f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1