Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -87,6 +87,7 @@
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -574,6 +574,7 @@
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -3119,6 +3120,32 @@
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    EVT SrcVT = Src.getValueType();
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint())
+          return DAG.getNode(ISD::BITCAST, SL, VT, Elt0);
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
@@ -3758,6 +3785,8 @@
 
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
Index: test/CodeGen/AMDGPU/function-returns.ll
===================================================================
--- test/CodeGen/AMDGPU/function-returns.ll
+++ test/CodeGen/AMDGPU/function-returns.ll
@@ -282,7 +282,7 @@
 }
 
 ; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9: v_lshrrev_b32
 ; GFX9: s_setpc_b64
@@ -304,9 +304,8 @@
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9: v_mov_b32_e32 v2, v1
-; GFX9: v_lshrrev_b32_e32 v1, 16, v0
 ; GCN: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
@@ -8,11 +8,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -26,11 +26,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -44,11 +44,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -62,11 +62,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -80,11 +80,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
@@ -41,11 +41,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -59,11 +59,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -77,11 +77,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -95,10 +95,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -112,10 +113,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Index: test/CodeGen/AMDGPU/store-weird-sizes.ll
===================================================================
--- test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -3,18 +3,28 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 ; GCN-LABEL: {{^}}local_store_i56:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i55:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
 }
Index: test/CodeGen/AMDGPU/trunc-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/trunc-combine.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
+; Make sure high constant 0 isn't pointlessly materialized
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i32
+  ret i32 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_v2i32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+  %load0 = load i32, i32 addrspace(1)* undef
+  %load1 = load i32, i32 addrspace(1)* null
+  %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
+  %bc = bitcast <2 x i32> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
+
+; Make sure there's no crash if the source vector type is FP
+; GCN-LABEL: {{^}}trunc_bitcast_v2f32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+  %load0 = load float, float addrspace(1)* undef
+  %load1 = load float, float addrspace(1)* null
+  %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
+  %bc = bitcast <2 x float> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
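
Note (not part of the patch itself): the new combine rewrites vt1 (truncate (bitcast (build_vector vt0:x, ...))) into a truncate or bitcast of the first vector element whenever the result type fits in that element, so the low element no longer has to be reassembled into the wide integer before being narrowed. A minimal IR sketch of the kind of input it targets, mirroring @trunc_bitcast_v2i32_to_i16 from the new test (the function name here is only illustrative):

; Illustrative sketch only, assuming the same fiji llc invocation as
; trunc-combine.ll. With the combine, the truncate should feed directly from
; %x, so the high element constant (99) should not need to be materialized.
define i16 @trunc_of_build_vector_sketch(i32 %x) {
  %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 99, i32 1
  %bc = bitcast <2 x i32> %v1 to i64
  %t = trunc i64 %bc to i16
  %r = add i16 %t, 4
  ret i16 %r
}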