Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -327,6 +327,8 @@
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -1384,6 +1386,14 @@
   SmallVector<SDValue, 8> Args;
   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   EVT VT = Op.getValueType();
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  // For these types, we have some TableGen patterns except if the index is 1
+  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
+       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
+      Start != 1)
+    return Op;
+
   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                             VT.getVectorNumElements());
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format2.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+@esgs_ring = external addrspace(3) global [0 x i32], align 65536
+
+define amdgpu_gs <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> @main(<4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %0, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %6, i32 inreg %7, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %8, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %9, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %10, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, <4 x i32> inreg %20, <4 x i32> inreg %21, <4 x i32> inreg %22, <4 x i32> inreg %23, <4 x i32> inreg %24, i32 %25, i32 %26, i32 %27, i32 %28, i32 %29, i32 %30, i32 %31, i32 %32, i32 %33, i32 %34, i32 %35, i32 %36, i32 %37, i32 %38) #0 {
+; GFX10-LABEL: main:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX10-NEXT: s_and_b32 s6, s3, 0xff
+; GFX10-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, s6, v5
+; GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT: s_cbranch_execz BB0_2
+; GFX10-NEXT: ; %bb.1: ; %if11500
+; GFX10-NEXT: buffer_load_format_d16_xyz v[6:7], v11, s[28:31], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_xyz v[18:19], v10, s[24:27], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX10-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX10-NEXT: s_lshr_b32 s7, s3, 18
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
+; GFX10-NEXT: v_and_or_b32 v5, 0x3c0, s7, v5
+; GFX10-NEXT: v_mov_b32_e32 v9, 1.0
+; GFX10-NEXT: s_mov_b32 s7, 0xffff
+; GFX10-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX10-NEXT: ds_write_b32 v5, v9 offset:784
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_bfe_i32 v12, v20, 0, 16
+; GFX10-NEXT: v_and_b32_e32 v18, v10, v18
+; GFX10-NEXT: v_and_b32_e32 v11, s7, v19
+; GFX10-NEXT: v_and_b32_e32 v6, s7, v6
+; GFX10-NEXT: v_and_b32_e32 v7, v10, v7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX10-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX10-NEXT: ds_write2_b32 v5, v12, v8 offset0:4 offset1:5
+; GFX10-NEXT: ds_write2_b32 v5, v18, v13 offset0:6 offset1:7
+; GFX10-NEXT: ds_write2_b32 v5, v11, v6 offset0:8 offset1:9
+; GFX10-NEXT: ds_write2_b32 v5, v9, v7 offset0:10 offset1:11
+; GFX10-NEXT: BB0_2: ; %endif11500
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: main:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX9-NEXT: s_and_b32 s6, s3, 0xff
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_cbranch_execz BB0_2
+; GFX9-NEXT: ; %bb.1: ; %if11500
+; GFX9-NEXT: buffer_load_format_d16_xyz v[6:7], v10, s[24:27], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_xyz v[18:19], v11, s[28:31], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX9-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX9-NEXT: s_lshr_b32 s11, s3, 18
+; GFX9-NEXT: s_and_b32 s11, s11, 0x3c0
+; GFX9-NEXT: v_or_b32_e32 v5, s11, v5
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX9-NEXT: s_mov_b32 s10, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX9-NEXT: ds_write_b32 v5, v10 offset:784
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_bfe_i32 v12, v20, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX9-NEXT: v_and_b32_e32 v7, s10, v7
+; GFX9-NEXT: v_and_b32_e32 v11, s10, v18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX9-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX9-NEXT: v_and_b32_e32 v9, v9, v19
+; GFX9-NEXT: ds_write2_b32 v5, v12, v8 offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b32 v5, v6, v13 offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b32 v5, v7, v11 offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b32 v5, v10, v9 offset0:10 offset1:11
+; GFX9-NEXT: BB0_2: ; %endif11500
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: main:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX8-NEXT: s_and_b32 s6, s3, 0xff
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, s6, v5
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_cbranch_execz BB0_2
+; GFX8-NEXT: ; %bb.1: ; %if11500
+; GFX8-NEXT: buffer_load_format_d16_xyz v[6:7], v10, s[24:27], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_xyz v[18:19], v11, s[28:31], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX8-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX8-NEXT: s_lshr_b32 s11, s3, 18
+; GFX8-NEXT: s_and_b32 s11, s11, 0x3c0
+; GFX8-NEXT: v_or_b32_e32 v5, s11, v5
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX8-NEXT: s_mov_b32 s10, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: ds_write_b32 v5, v10 offset:784
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v7, s10, v7
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_i32 v11, v20, 0, 16
+; GFX8-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX8-NEXT: v_alignbit_b32 v13, v19, v18, 16
+; GFX8-NEXT: v_and_b32_e32 v10, s10, v18
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX8-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX8-NEXT: ds_write2_b32 v5, v11, v8 offset0:4 offset1:5
+; GFX8-NEXT: ds_write2_b32 v5, v6, v12 offset0:6 offset1:7
+; GFX8-NEXT: ds_write2_b32 v5, v7, v10 offset0:8 offset1:9
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_and_b32_e32 v7, v9, v13
+; GFX8-NEXT: ds_write2_b32 v5, v7, v6 offset0:10 offset1:11
+; GFX8-NEXT: BB0_2: ; %endif11500
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+main_body:
+  %39 = and i32 %3, 255
+  %40 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
+  %41 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %40) #3, !range !0
+  %42 = icmp ult i32 %41, %39
+  br i1 %42, label %if11500, label %endif11500
+
+if11500: ; preds = %main_body
+  %43 = call nsz arcp <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %20, i32 %34, i32 0, i32 0, i32 0) #3
+  %44 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %21, i32 %35, i32 0, i32 0, i32 0) #3
+  %bc4 = bitcast <3 x half> %44 to <3 x i16>
+  %45 = extractelement <3 x i16> %bc4, i32 0
+  %bc5 = bitcast <3 x half> %44 to <3 x i16>
+  %46 = extractelement <3 x i16> %bc5, i32 1
+  %bc6 = bitcast <3 x half> %44 to <3 x i16>
+  %47 = extractelement <3 x i16> %bc6, i32 2
+  %48 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %22, i32 %36, i32 0, i32 0, i32 0) #3
+  %bc7 = bitcast <3 x half> %48 to <3 x i16>
+  %49 = extractelement <3 x i16> %bc7, i32 0
+  %bc8 = bitcast <3 x half> %48 to <3 x i16>
+  %50 = extractelement <3 x i16> %bc8, i32 1
+  %bc9 = bitcast <3 x half> %48 to <3 x i16>
+  %51 = extractelement <3 x i16> %bc9, i32 2
+  %52 = call nsz arcp half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %23, i32 %37, i32 0, i32 0, i32 0) #3
+  %53 = bitcast half %52 to i16
+  %54 = call nsz arcp half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %24, i32 %38, i32 0, i32 0, i32 0) #3
+  %55 = bitcast half %54 to i16
+  %56 = zext i16 %45 to i32
+  %57 = zext i16 %46 to i32
+  %58 = zext i16 %47 to i32
+  %59 = zext i16 %49 to i32
+  %60 = zext i16 %50 to i32
+  %61 = zext i16 %51 to i32
+  %62 = sext i16 %53 to i32
+  %63 = sext i16 %55 to i32
+  %64 = lshr i32 %3, 18
+  %65 = and i32 %64, 960
+  %66 = or i32 %41, %65
+  %67 = mul nuw nsw i32 %66, 201
+  %bc = bitcast <4 x float> %43 to <4 x i32>
+  %68 = extractelement <4 x i32> %bc, i32 0
+  %69 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %67
+  store i32 %68, i32 addrspace(3)* %69, align 4
+  %bc10 = bitcast <4 x float> %43 to <4 x i32>
+  %70 = extractelement <4 x i32> %bc10, i32 1
+  %71 = add nuw nsw i32 %67, 1
+  %72 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %71
+  store i32 %70, i32 addrspace(3)* %72, align 4
+  %bc11 = bitcast <4 x float> %43 to <4 x i32>
+  %73 = extractelement <4 x i32> %bc11, i32 2
+  %74 = add nuw nsw i32 %67, 2
+  %75 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %74
+  store i32 %73, i32 addrspace(3)* %75, align 4
+  %bc12 = bitcast <4 x float> %43 to <4 x i32>
+  %76 = extractelement <4 x i32> %bc12, i32 3
+  %77 = add nuw nsw i32 %67, 3
+  %78 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %77
+  store i32 %76, i32 addrspace(3)* %78, align 4
+  %79 = add nuw nsw i32 %67, 196
+  %80 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %79
+  store i32 1065353216, i32 addrspace(3)* %80, align 4
+  %81 = add nuw nsw i32 %67, 4
+  %82 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %81
+  store i32 %63, i32 addrspace(3)* %82, align 4
+  %83 = add nuw nsw i32 %67, 5
+  %84 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %83
+  store i32 %62, i32 addrspace(3)* %84, align 4
+  %85 = add nuw nsw i32 %67, 6
+  %86 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %85
+  store i32 %56, i32 addrspace(3)* %86, align 4
+  %87 = add nuw nsw i32 %67, 7
+  %88 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %87
+  store i32 %57, i32 addrspace(3)* %88, align 4
+  %89 = add nuw nsw i32 %67, 8
+  %90 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %89
+  store i32 %58, i32 addrspace(3)* %90, align 4
+  %91 = add nuw nsw i32 %67, 9
+  %92 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %91
+  store i32 %59, i32 addrspace(3)* %92, align 4
+  %93 = add nuw nsw i32 %67, 10
+  %94 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %93
+  store i32 %60, i32 addrspace(3)* %94, align 4
+  %95 = add nuw nsw i32 %67, 11
+  %96 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %95
+  store i32 %61, i32 addrspace(3)* %96, align 4
+  br label %endif11500
+
+endif11500: ; preds = %if11500, %main_body
+  %97 = ptrtoint <4 x i32> addrspace(6)* %0 to i32
+  %98 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> undef, i32 %97, 0
+  %99 = ptrtoint <8 x i32> addrspace(6)* %1 to i32
+  %100 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %98, i32 %99, 1
+  %101 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %100, i32 %2, 2
+  %102 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %101, i32 %3, 3
+  %103 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %102, i32 %5, 5
+  %104 = ptrtoint <4 x i32> addrspace(6)* %8 to i32
+  %105 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %103, i32 %104, 8
+  %106 = ptrtoint <8 x i32> addrspace(6)* %9 to i32
+  %107 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %105, i32 %106, 9
+  %108 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %107, i32 %12, 12
+  %109 = bitcast i32 %25 to float
+  %110 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %108, float %109, 13
+  %111 = bitcast i32 %26 to float
+  %112 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %110, float %111, 14
+  %113 = bitcast i32 %27 to float
+  %114 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %112, float %113, 15
+  %115 = bitcast i32 %28 to float
+  %116 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %114, float %115, 16
+  %117 = bitcast i32 %29 to float
+  %118 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %116, float %117, 17
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %118
+}
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+; Function Attrs: nounwind readonly willreturn
+declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+; Function Attrs: nounwind readonly willreturn
+declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+attributes #0 = { "amdgpu-32bit-address-high-bits"="0xffff8000" "amdgpu-flat-work-group-size"="128,128" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone willreturn }
+attributes #2 = { nounwind readonly willreturn }
+attributes #3 = { nounwind readnone }
+
+!0 = !{i32 0, i32 64}