diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,12 +58,14 @@
 
 // Check if a value can be converted to a 16-bit value without losing
 // precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+static bool areAllDefs16Bit(Value &V) {
   Type *VTy = V.getType();
-  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
+  Type *STy = VTy->getScalarType();
+  if (STy->isHalfTy() || STy->isIntegerTy(16)) {
     // The value is already 16-bit, so we don't want to convert to 16-bit again!
     return false;
   }
+
   if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
     // We need to check that if we cast the index down to a half, we do not lose
     // precision.
@@ -72,11 +74,22 @@
     FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
     return !LosesInfo;
   }
+
+  if (VTy->isVectorTy()) {
+    if (auto *ConstVec = dyn_cast<Constant>(&V)) {
+      for (auto &Part : ConstVec->operands()) {
+        if (!areAllDefs16Bit(*Part))
+          return false;
+      }
+      return true;
+    }
+  }
+
   Value *CastSrc;
   if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
       match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
       match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
-    Type *CastSrcTy = CastSrc->getType();
+    Type *CastSrcTy = CastSrc->getType()->getScalarType();
     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
       return true;
   }
@@ -89,14 +102,37 @@
   Type *VTy = V.getType();
   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
     return cast<Instruction>(&V)->getOperand(0);
-  if (VTy->isIntegerTy())
-    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
-  if (VTy->isFloatingPointTy())
-    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
+  auto *NewScalarTy =
+      VTy->isFPOrFPVectorTy() ? Builder.getHalfTy() : Builder.getInt16Ty();
+  Type *NewTy;
+  if (auto *VectorTy = dyn_cast<VectorType>(VTy))
+    NewTy = VectorType::get(NewScalarTy, VectorTy->getElementCount());
+  else
+    NewTy = NewScalarTy;
+
+  if (VTy->isIntOrIntVectorTy())
+    return Builder.CreateTrunc(&V, NewTy);
+  if (VTy->isFPOrFPVectorTy())
+    return Builder.CreateFPTrunc(&V, NewTy);
 
   llvm_unreachable("Should never be called!");
 }
 
+/// Check if all uses of a value only need 16-bit precision.
+static bool areAllUses16Bit(Value &V) {
+  for (auto *Use : V.users()) {
+    Value *CastSrc;
+    if (match(Use, m_FPTrunc(PatternMatch::m_Value(CastSrc))) ||
+        match(Use, m_Trunc(PatternMatch::m_Value(CastSrc)))) {
+      Type *CastDestTy = Use->getType()->getScalarType();
+      if (CastDestTy->isHalfTy() || CastDestTy->isIntegerTy(16))
+        continue;
+    }
+    return false;
+  }
+  return true;
+}
+
 /// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
 /// the modified arguments.
 static Optional<Instruction *> modifyIntrinsicCall(
@@ -182,6 +218,60 @@
     }
   }
 
+  // Try to use D16
+  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
+      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+  if (ST->hasD16LoadStore() && BaseInfo->HasD16) {
+    Type *DataTy;
+    if (BaseInfo->Store)
+      DataTy = II.getFunctionType()->getParamType(0);
+    else
+      DataTy = II.getType();
+    auto *ScalarTy = DataTy->getScalarType();
+
+    if ((ScalarTy->isFloatTy() || ScalarTy->isIntegerTy(32))) {
+      auto *NewScalarTy = ScalarTy->isFloatTy() ? IC.Builder.getHalfTy()
+                                                : IC.Builder.getInt16Ty();
+      Type *NewDataTy;
+      if (auto *VTy = dyn_cast<VectorType>(DataTy))
+        NewDataTy = VectorType::get(NewScalarTy, VTy->getElementCount());
+      else
+        NewDataTy = NewScalarTy;
+
+      if (BaseInfo->Store) {
+        if (areAllDefs16Bit(*II.getArgOperand(0))) {
+          return modifyIntrinsicCall(
+              II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
+                ArgTys[0] = NewDataTy;
+                Args[0] = convertTo16Bit(*Args[0], IC.Builder);
+              });
+        }
+      } else {
+        if (areAllUses16Bit(II)) {
+          SmallVector<Type *> ArgTys;
+          if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+            return None;
+
+          SmallVector<Value *> Args(II.args());
+
+          ArgTys[0] = NewDataTy;
+
+          Function *I = Intrinsic::getDeclaration(II.getModule(),
+                                                  II.getIntrinsicID(), ArgTys);
+
+          CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+          NewCall->takeName(&II);
+          NewCall->copyMetadata(II);
+          if (isa<FPMathOperator>(NewCall))
+            NewCall->copyFastMathFlags(&II);
+
+          auto *NewValue = IC.Builder.CreateFPExt(NewCall, DataTy);
+          return IC.replaceInstUsesWith(II, NewValue);
+        }
+      }
+    }
+  }
+
   // Try to use A16 or G16
   if (!ST->hasA16() && !ST->hasG16())
     return None;
@@ -194,7 +284,7 @@
        OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
     Value *Coord = II.getOperand(OperandIndex);
     // If the values are not derived from 16-bit values, we cannot optimize.
-    if (!canSafelyConvertTo16Bit(*Coord)) {
+    if (!areAllDefs16Bit(*Coord)) {
       if (OperandIndex < ImageDimIntr->CoordStart ||
           ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
         return None;
@@ -215,7 +305,7 @@
   // Check if there is a bias parameter and if it can be converted to f16
   if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
     Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
-    if (!canSafelyConvertTo16Bit(*Bias))
+    if (!areAllDefs16Bit(*Bias))
       OnlyDerivatives = true;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -38,12 +38,6 @@
 };
 const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr);
 
-struct D16ImageDimIntrinsic {
-  unsigned Intr;
-  unsigned D16HelperIntr;
-};
-const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr);
-
 struct ImageDimIntrinsicInfo {
   unsigned Intr;
   unsigned BaseOpcode;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -43,11 +43,10 @@
 class AAResults;
 
 namespace AMDGPU {
-#define GET_D16ImageDimIntrinsics_IMPL
 #define GET_ImageDimIntrinsicTable_IMPL
 #define GET_RsrcIntrinsics_IMPL
 #include "AMDGPUGenSearchableTables.inc"
-}
+} // namespace AMDGPU
 
 }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2800,6 +2800,80 @@
   ret void
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample d16
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @image_sample_d16_1d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; CHECK-LABEL: @image_sample_d16_1d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32(i32 15, float [[S:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_gather_d16_2d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+; CHECK-LABEL: @image_gather_d16_2d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_load_d16_2d(<4 x half> addrspace(1)* %out, <8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: @image_load_d16_2d(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    store <4 x half> [[RES]], <4 x half> addrspace(1)* [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %res = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  %res16 = fptrunc <4 x float> %res to <4 x half>
+  store <4 x half> %res16, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
+; CHECK-LABEL: @image_store_d16_2d(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.f16.i32(half [[DATA:%.*]], i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %data32 = fpext half %data to float
+  call void @llvm.amdgcn.image.store.2d.f32.i32(float %data32, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %data) {
+; CHECK-LABEL: @image_store_d16_2d_v4f32(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> [[DATA:%.*]], i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  %data32 = fpext <4 x half> %data to <4 x float>
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %data32, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @image_store_d16_2d_const_v4f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: @image_store_d16_2d_const_v4f32(
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> <half 0xH0000, half 0xH3C00, half 0xH4000, half 0xH4200>, i32 1, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> < float 0.0, float 1.0, float 2.0, float 3.0 >, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32)
+declare void @llvm.amdgcn.image.store.2d.f32.i32(float, i32, i32, i32, <8 x i32>, i32, i32)
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32)
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.image.sample a16
 ; --------------------------------------------------------------------