diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -369,6 +369,12 @@ "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands" >; +def FeatureG16 : SubtargetFeature<"g16", + "HasG16", + "true", + "Support G16 for 16-bit gradient image operands" +>; + def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "HasNSAEncoding", "true", @@ -689,7 +695,7 @@ FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC, - FeatureGFX10A16 + FeatureGFX10A16, FeatureG16 ] >; @@ -1104,6 +1110,9 @@ def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">, AssemblerPredicate<(all_of FeatureGFX10A16)>; +def HasG16 : Predicate<"Subtarget->hasG16()">, + AssemblerPredicate<(all_of FeatureG16)>; + def HasDPP16 : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -351,6 +351,7 @@ bool HasDPP8; bool HasR128A16; bool HasGFX10A16; + bool HasG16; bool HasNSAEncoding; bool HasDLInsts; bool HasDot1Insts; @@ -1005,6 +1006,10 @@ return HasGFX10A16; } + bool hasG16() const { + return HasG16; + } + bool hasOffset3fBug() const { return HasOffset3fBug; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -244,6 +244,7 @@ HasDPP8(false), HasR128A16(false), HasGFX10A16(false), + HasG16(false), HasNSAEncoding(false), HasDLInsts(false), HasDot1Insts(false), diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -35,6 +35,7 @@ bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; + bit G16 = 0; bit Coordinates = 1; bit LodOrClampOrMip = 0; bit HasD16 = 0; @@ -47,9 +48,9 @@ def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", - "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", - "HasD16"]; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", + "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; let PrimaryKey = ["BaseOpcode"]; @@ -117,6 +118,22 @@ let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGG16Mapping { + MIMGBaseOpcode G = g; + MIMGBaseOpcode G16 = g16; +} + +def MIMGG16MappingTable : GenericTable { + let FilterClass = "MIMGG16Mapping"; + let CppTypeName = "MIMGG16MappingInfo"; + let Fields = ["G", "G16"]; + GenericEnum TypeOf_G = MIMGBaseOpcode; + GenericEnum TypeOf_G16 = MIMGBaseOpcode; + + let PrimaryKey = ["G"]; + let PrimaryKeyName = "getMIMGG16MappingInfo"; +} + class MIMG_Base : InstSI { @@ -646,10 +663,11 @@ } multiclass MIMG_Sampler op, AMDGPUSampleVariant sample, bit wqm = 0, - bit isGetLod = 0, - string asm = "image_sample"#sample.LowerCaseMod> { + bit isG16 = 0, bit isGetLod = 0, + string asm = 
"image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> { def "" : MIMG_Sampler_BaseOpcode { let HasD16 = !if(isGetLod, 0, 1); + let G16 = isG16; } let BaseOpcode = !cast(NAME), WQM = wqm, @@ -726,73 +744,89 @@ //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI //} // End let FPAtomic = 1 -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, 
AMDGPUSample_c_b_cl>; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; - -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>; +defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>; +defm 
IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>; +defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm 
IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; +defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>; +defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>; +defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>; +defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; @@ -835,3 +869,21 @@ // MIP to NONMIP Optimization Mapping def : MIMGMIPMapping; def : MIMGMIPMapping; + +// G to G16 Optimization Mapping +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; +def : MIMGG16Mapping; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5310,6 +5310,7 @@ SmallVector ResultTypes(Op->value_begin(), Op->value_end()); SmallVector OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsG16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; @@ -5416,45 +5417,100 @@ } } - // Check for 16 bit addresses and pack if true. + // Push back extra arguments. + for (unsigned i = 0; i < BaseOpcode->NumExtraArgs; i++) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + + // Check for 16 bit derivatives and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); - const MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + if (BaseOpcode->Gradients && (VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16)) { + IsG16 = true; + + // 1D: undef,dx/dh; undef,dx/dv + // 2D: dy/dh,dx/dh; dy/dv,dx/dv + // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv + for (unsigned i = 0; i < 2; i++) { + unsigned Idx = DimIdx + i * (NumGradients / 2); + SDValue Pack; + if ((NumGradients / 2) >= 2) { + Pack = DAG.getBuildVector(PackVectorVT, DL, + {Op.getOperand(Idx), Op.getOperand(Idx + 1)}); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); + } + if ((NumGradients / 2) % 2 != 0) { + // Odd, either 1D or 3D + Pack = Op.getOperand(Idx + NumGradients / 2 - 1); + if (Pack.getValueType() != MVT::i16) + Pack = DAG.getBitcast(MVT::i16, Pack); + Pack = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Pack); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); + } + } + } else { + for (unsigned i = 0; i < NumGradients; i++) + VAddrs.push_back(Op.getOperand(DimIdx + i)); + } + + // Check for 16 bit addresses and pack if true. 
+ unsigned CoordIdx = DimIdx + NumGradients; + unsigned CoordsEnd = AddrIdx + NumMIVAddrs; + VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrScalarVT = VAddrVT.getScalarType(); + PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16))) { // Illegal to use a16 images - if (!ST->hasFeature(AMDGPU::FeatureR128A16) && !ST->hasFeature(AMDGPU::FeatureGFX10A16)) + if (!ST->hasFeature(AMDGPU::FeatureR128A16) && !ST->hasFeature(AMDGPU::FeatureGFX10A16)) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit addresses\n"); return Op; + } + + // Illegal to use 16 bit addresses with 32 bit derivatives + if (BaseOpcode->Gradients && !IsG16) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "need 16 bit derivatives but got 32 bit derivatives\n"); + return Op; + } + IsG16 = false; IsA16 = true; - const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; - for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { - SDValue AddrLo; - // Push back extra arguments. - if (i < DimIdx) { - AddrLo = Op.getOperand(i); + for (unsigned i = CoordIdx; i < CoordsEnd; i++) { + SDValue Pack; + if ((i + 1) < CoordsEnd) { + Pack = DAG.getBuildVector(PackVectorVT, DL, + {Op.getOperand(i), + Op.getOperand(i + 1)}); + i++; } else { - // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, - // in 1D, derivatives dx/dh and dx/dv are packed with undef. - if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || - ((NumGradients / 2) % 2 == 1 && - (i == DimIdx + (NumGradients / 2) - 1 || - i == DimIdx + NumGradients - 1))) { - AddrLo = Op.getOperand(i); - if (AddrLo.getValueType() != MVT::i16) - AddrLo = DAG.getBitcast(MVT::i16, Op.getOperand(i)); - AddrLo = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, AddrLo); - } else { - AddrLo = DAG.getBuildVector(VectorVT, DL, - {Op.getOperand(i), Op.getOperand(i + 1)}); - i++; - } - AddrLo = DAG.getBitcast(MVT::f32, AddrLo); + Pack = Op.getOperand(i); + if (Pack.getValueType() != MVT::i16) + Pack = DAG.getBitcast(MVT::i16, Pack); + Pack = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Pack); } - VAddrs.push_back(AddrLo); + Pack = DAG.getBitcast(MVT::f32, Pack); + VAddrs.push_back(Pack); } } else { - for (unsigned i = 0; i < NumMIVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); + for (unsigned i = CoordIdx; i < CoordsEnd; i++) + VAddrs.push_back(Op.getOperand(i)); + } + + // Illegal to use g16 images without 16 bit addresses on target that do not + // support it. 
+ if (IsG16) { + if (!ST->hasFeature(AMDGPU::FeatureG16)) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 } // If the register allocator cannot place the address registers contiguously diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" @@ -3754,7 +3756,7 @@ IsA16 = A16->getImm() != 0; } - bool PackDerivatives = IsA16; // Either A16 or G16 + bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; unsigned AddrWords = BaseOpcode->NumExtraArgs; @@ -3791,6 +3793,8 @@ } if (VAddrWords != AddrWords) { + LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords + << " but got " << VAddrWords << "\n"); ErrInfo = "bad vaddr size"; return false; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -211,6 +211,7 @@ uint8_t NumExtraArgs; bool Gradients; + bool G16; bool Coordinates; bool LodOrClampOrMip; bool HasD16; @@ -247,11 +248,19 @@ MIMGBaseOpcode NONMIP; }; +struct MIMGG16MappingInfo { + MIMGBaseOpcode G; + MIMGBaseOpcode G16; +}; + LLVM_READONLY const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); LLVM_READONLY -const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L); +const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); + +LLVM_READONLY +const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -552,6 +561,7 @@ bool hasSRAMECC(const MCSubtargetInfo &STI); bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasGFX10A16(const MCSubtargetInfo &STI); +bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -108,6 +108,7 @@ #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -934,6 +935,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16]; } +bool hasG16(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureG16]; +} + bool hasPackedD16(const MCSubtargetInfo &STI) { return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s + 
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 ; encoding: [0x09,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 ; encoding: [0x09,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; 
encoding: [0x01,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, 
half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: 
[0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; encoding: [0xff,0x02,0x14,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; encoding: [0x0a,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 ; encoding: [0x0a,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, 
half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; encoding: [0xff,0x02,0x10,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 ; encoding: [0x08,0x07,0x06,0x36] +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; encoding: [0x08,0x03,0x02,0x36] +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V1: +; GFX10: ; %bb.0: ; %main_body +; 
GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { +; GFX10-LABEL: sample_c_d_o_2darray_V2: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; encoding: [0xff,0x02,0x12,0x7e,0xff,0xff,0x00,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; encoding: [0x09,0x09,0x08,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; encoding: [0x09,0x05,0x04,0x36] +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] +; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <2 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s + +define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; 
GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { +; GFX10-LABEL: sample_d_3d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, v9, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_d_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_d_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> 
@sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_c_d_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { +; GFX10-LABEL: sample_c_cd_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { +; GFX10-LABEL: sample_c_cd_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { +; GFX10-LABEL: sample_cd_cl_1d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { +; GFX10-LABEL: sample_cd_cl_2d: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half 
+define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
+; GFX10-LABEL: sample_c_cd_cl_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
+; GFX10-LABEL: sample_c_cd_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_c_d_o_2darray_V1:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret float %v
+}
+
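+; V1 and V2 differ only in how many channels the dmask requests: 0x4 selects
+; a single component (result in v0), while 0x6 below selects two (result in
+; v[0:1]). The gradient packing is identical in both.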
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) {
+; GFX10-LABEL: sample_c_d_o_2darray_V2:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
+; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
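+; The declarations below use the overload suffix <return>.<gradient>.<coord>;
+; every G16 variant takes f16 gradients while coordinates and clamp stay f32.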
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32, half, half, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32, float, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32, float, half, half, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i32, float, half, half, half, half, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-image-g16.ll
@@ -0,0 +1,17 @@
+; RUN: not --crash llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+; Make sure this doesn't assert on targets without the g16 feature, and instead
+; generates a selection error.
+
+; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.image.sample.d.1d
+
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+attributes #0 = { nounwind readonly }