Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -179,8 +179,8 @@
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
   }
 }
 
@@ -413,8 +413,8 @@
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
   }
 }
 
@@ -434,8 +434,8 @@
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
   }
 }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4728,14 +4728,14 @@
   EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                            : AdjEltVT;
 
-  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // Special case for v6f16. Rather than add support for this, use v3i32 to
   // extract the data elements
-  bool V8F16Special = false;
-  if (CastVT == MVT::v8f16) {
-    CastVT = MVT::v4i32;
+  bool V6F16Special = false;
+  if (NumElts == 6) {
+    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
     DMaskPop >>= 1;
     ReqRetNumElts >>= 1;
-    V8F16Special = true;
+    V6F16Special = true;
     AdjVT = MVT::v2i32;
   }
 
@@ -4765,7 +4765,7 @@
     PreTFCRes = BVElts[0];
   }
 
-  if (V8F16Special)
+  if (V6F16Special)
     PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
 
   if (!IsTexFail) {
@@ -5004,9 +5004,6 @@
     return Undef;
   }
 
-  // Have to use a power of 2 number of dwords
-  NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
   EVT NewVT = NumVDataDwords > 1 ?
                   EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
                 : MVT::f32;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -22,7 +22,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -45,7 +45,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -76,7 +76,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
@@ -107,7 +107,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
@@ -138,7 +138,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -169,7 +169,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
@@ -200,7 +200,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -231,7 +231,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
@@ -262,7 +262,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
@@ -293,7 +293,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
@@ -324,7 +324,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
@@ -451,7 +451,7 @@
 ; NOPRT: v_mov_b32_e32 v2, 0
 ; NOPRT-NOT: v_mov_b32_e32 v0
 ; NOPRT-NOT: v_mov_b32_e32 v1
-; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
+; GCN: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
 ; SIVI: buffer_store_dword v2, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -19,7 +19,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.1d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -57,7 +57,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.2d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -99,7 +99,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.3d:
-; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -15,7 +15,7 @@
 ; GCN: v_mov_b32_e32 v2, v0
 ; GCN: v_mov_b32_e32 v3, v0
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
 define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -155,7 +155,7 @@
 ; GCN: v_mov_b32_e32 v2, v0
 ; GCN: v_mov_b32_e32 v3, v0
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
 define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
@@ -537,7 +537,7 @@
 }
 
 ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
-; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
+; GCN: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
 define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
 main_body:
   %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)