Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -327,6 +327,8 @@
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -1384,6 +1386,14 @@
   SmallVector<SDValue, 8> Args;
   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   EVT VT = Op.getValueType();
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  // For these types, we have some TableGen patterns except if the index is 1
+  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
+       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
+      Start != 1)
+    return Op;
+
   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                             VT.getVectorNumElements());
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format2.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+
+@esgs_ring = external addrspace(3) global [0 x i32], align 65536
+
+define amdgpu_gs <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> @main(<4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %0, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %6, i32 inreg %7, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %8, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %9, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %10, <8 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, <4 x i32> addrspace(6)* inreg noalias align 32 dereferenceable(18446744073709551615) %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, <4 x i32> inreg %20, <4 x i32> inreg %21, <4 x i32> inreg %22, <4 x i32> inreg %23, <4 x i32> inreg %24, i32 %25, i32 %26, i32 %27, i32 %28, i32 %29, i32 %30, i32 %31, i32 %32, i32 %33, i32 %34, i32 %35, i32 %36, i32 %37, i32 %38) #0 {
+; GFX10-LABEL: main:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX10-NEXT: s_and_b32 s6, s3, 0xff
+; GFX10-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, s6, v5
+; GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT: s_cbranch_execz BB0_2
+; GFX10-NEXT: ; %bb.1: ; %if11500
+; GFX10-NEXT: buffer_load_format_d16_xyz v[6:7], v11, s[28:31], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_xyz v[18:19], v10, s[24:27], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX10-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX10-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX10-NEXT: s_lshr_b32 s7, s3, 18
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
+; GFX10-NEXT: v_and_or_b32 v5, 0x3c0, s7, v5
+; GFX10-NEXT: v_mov_b32_e32 v9, 1.0
+; GFX10-NEXT: s_mov_b32 s7, 0xffff
+; GFX10-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX10-NEXT: ds_write_b32 v5, v9 offset:784
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v18
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_bfe_i32 v12, v20, 0, 16
+; GFX10-NEXT: v_and_b32_e32 v18, v10, v18
+; GFX10-NEXT: v_and_b32_e32 v11, s7, v19
+; GFX10-NEXT: v_and_b32_e32 v6, s7, v6
+; GFX10-NEXT: v_and_b32_e32 v7, v10, v7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX10-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX10-NEXT: ds_write2_b32 v5, v12, v8 offset0:4 offset1:5
+; GFX10-NEXT: ds_write2_b32 v5, v18, v13 offset0:6 offset1:7
+; GFX10-NEXT: ds_write2_b32 v5, v11, v6 offset0:8 offset1:9
+; GFX10-NEXT: ds_write2_b32 v5, v9, v7 offset0:10 offset1:11
+; GFX10-NEXT: BB0_2: ; %endif11500
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: main:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX9-NEXT: s_and_b32 s6, s3, 0xff
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v5
+; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX9-NEXT: s_cbranch_execz BB0_2
+; GFX9-NEXT: ; %bb.1: ; %if11500
+; GFX9-NEXT: buffer_load_format_d16_xyz v[6:7], v10, s[24:27], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_xyz v[18:19], v11, s[28:31], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX9-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX9-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX9-NEXT: s_lshr_b32 s11, s3, 18
+; GFX9-NEXT: s_and_b32 s11, s11, 0x3c0
+; GFX9-NEXT: v_or_b32_e32 v5, s11, v5
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX9-NEXT: s_mov_b32 s10, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX9-NEXT: ds_write_b32 v5, v10 offset:784
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX9-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_bfe_i32 v12, v20, 0, 16
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX9-NEXT: v_and_b32_e32 v7, s10, v7
+; GFX9-NEXT: v_and_b32_e32 v11, s10, v18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX9-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX9-NEXT: v_and_b32_e32 v9, v9, v19
+; GFX9-NEXT: ds_write2_b32 v5, v12, v8 offset0:4 offset1:5
+; GFX9-NEXT: ds_write2_b32 v5, v6, v13 offset0:6 offset1:7
+; GFX9-NEXT: ds_write2_b32 v5, v7, v11 offset0:8 offset1:9
+; GFX9-NEXT: ds_write2_b32 v5, v10, v9 offset0:10 offset1:11
+; GFX9-NEXT: BB0_2: ; %endif11500
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: main:
+; GFX8: ; %bb.0: ; %main_body
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX8-NEXT: s_and_b32 s6, s3, 0xff
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, s6, v5
+; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT: s_cbranch_execz BB0_2
+; GFX8-NEXT: ; %bb.1: ; %if11500
+; GFX8-NEXT: buffer_load_format_d16_xyz v[6:7], v10, s[24:27], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_xyz v[18:19], v11, s[28:31], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_x v8, v12, s[32:35], 0 idxen
+; GFX8-NEXT: buffer_load_format_d16_x v20, v13, s[36:39], 0 idxen
+; GFX8-NEXT: buffer_load_format_xyzw v[14:17], v9, s[20:23], 0 idxen
+; GFX8-NEXT: s_lshr_b32 s11, s3, 18
+; GFX8-NEXT: s_and_b32 s11, s11, 0x3c0
+; GFX8-NEXT: v_or_b32_e32 v5, s11, v5
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, 0xc9, v5
+; GFX8-NEXT: s_mov_b32 s10, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v9, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 2, v5
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: ds_write_b32 v5, v10 offset:784
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v7, s10, v7
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_bfe_i32 v11, v20, 0, 16
+; GFX8-NEXT: v_and_b32_e32 v6, v9, v6
+; GFX8-NEXT: v_alignbit_b32 v13, v19, v18, 16
+; GFX8-NEXT: v_and_b32_e32 v10, s10, v18
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ds_write2_b32 v5, v14, v15 offset1:1
+; GFX8-NEXT: ds_write2_b32 v5, v16, v17 offset0:2 offset1:3
+; GFX8-NEXT: ds_write2_b32 v5, v11, v8 offset0:4 offset1:5
+; GFX8-NEXT: ds_write2_b32 v5, v6, v12 offset0:6 offset1:7
+; GFX8-NEXT: ds_write2_b32 v5, v7, v10 offset0:8 offset1:9
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_and_b32_e32 v7, v9, v13
+; GFX8-NEXT: ds_write2_b32 v5, v7, v6 offset0:10 offset1:11
+; GFX8-NEXT: BB0_2: ; %endif11500
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+main_body:
+  %39 = and i32 %3, 255
+  %40 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
+  %41 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %40) #3, !range !0
+  %42 = icmp ult i32 %41, %39
+  br i1 %42, label %if11500, label %endif11500
+
+if11500: ; preds = %main_body
+  %43 = call nsz arcp <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %20, i32 %34, i32 0, i32 0, i32 0) #3
+  %44 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %21, i32 %35, i32 0, i32 0, i32 0) #3
+  %bc4 = bitcast <3 x half> %44 to <3 x i16>
+  %45 = extractelement <3 x i16> %bc4, i32 0
+  %bc5 = bitcast <3 x half> %44 to <3 x i16>
+  %46 = extractelement <3 x i16> %bc5, i32 1
+  %bc6 = bitcast <3 x half> %44 to <3 x i16>
+  %47 = extractelement <3 x i16> %bc6, i32 2
+  %48 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %22, i32 %36, i32 0, i32 0, i32 0) #3
+  %bc7 = bitcast <3 x half> %48 to <3 x i16>
+  %49 = extractelement <3 x i16> %bc7, i32 0
+  %bc8 = bitcast <3 x half> %48 to <3 x i16>
+  %50 = extractelement <3 x i16> %bc8, i32 1
+  %bc9 = bitcast <3 x half> %48 to <3 x i16>
+  %51 = extractelement <3 x i16> %bc9, i32 2
+  %52 = call nsz arcp half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %23, i32 %37, i32 0, i32 0, i32 0) #3
+  %53 = bitcast half %52 to i16
+  %54 = call nsz arcp half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %24, i32 %38, i32 0, i32 0, i32 0) #3
+  %55 = bitcast half %54 to i16
+  %56 = zext i16 %45 to i32
+  %57 = zext i16 %46 to i32
+  %58 = zext i16 %47 to i32
+  %59 = zext i16 %49 to i32
+  %60 = zext i16 %50 to i32
+  %61 = zext i16 %51 to i32
+  %62 = sext i16 %53 to i32
+  %63 = sext i16 %55 to i32
+  %64 = lshr i32 %3, 18
+  %65 = and i32 %64, 960
+  %66 = or i32 %41, %65
+  %67 = mul nuw nsw i32 %66, 201
+  %bc = bitcast <4 x float> %43 to <4 x i32>
+  %68 = extractelement <4 x i32> %bc, i32 0
+  %69 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %67
+  store i32 %68, i32 addrspace(3)* %69, align 4
+  %bc10 = bitcast <4 x float> %43 to <4 x i32>
+  %70 = extractelement <4 x i32> %bc10, i32 1
+  %71 = add nuw nsw i32 %67, 1
+  %72 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %71
+  store i32 %70, i32 addrspace(3)* %72, align 4
+  %bc11 = bitcast <4 x float> %43 to <4 x i32>
+  %73 = extractelement <4 x i32> %bc11, i32 2
+  %74 = add nuw nsw i32 %67, 2
+  %75 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %74
+  store i32 %73, i32 addrspace(3)* %75, align 4
+  %bc12 = bitcast <4 x float> %43 to <4 x i32>
+  %76 = extractelement <4 x i32> %bc12, i32 3
+  %77 = add nuw nsw i32 %67, 3
+  %78 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %77
+  store i32 %76, i32 addrspace(3)* %78, align 4
+  %79 = add nuw nsw i32 %67, 196
+  %80 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %79
+  store i32 1065353216, i32 addrspace(3)* %80, align 4
+  %81 = add nuw nsw i32 %67, 4
+  %82 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %81
+  store i32 %63, i32 addrspace(3)* %82, align 4
+  %83 = add nuw nsw i32 %67, 5
+  %84 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %83
+  store i32 %62, i32 addrspace(3)* %84, align 4
+  %85 = add nuw nsw i32 %67, 6
+  %86 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %85
+  store i32 %56, i32 addrspace(3)* %86, align 4
+  %87 = add nuw nsw i32 %67, 7
+  %88 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %87
+  store i32 %57, i32 addrspace(3)* %88, align 4
+  %89 = add nuw nsw i32 %67, 8
+  %90 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %89
+  store i32 %58, i32 addrspace(3)* %90, align 4
+  %91 = add nuw nsw i32 %67, 9
+  %92 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %91
+  store i32 %59, i32 addrspace(3)* %92, align 4
+  %93 = add nuw nsw i32 %67, 10
+  %94 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %93
+  store i32 %60, i32 addrspace(3)* %94, align 4
+  %95 = add nuw nsw i32 %67, 11
+  %96 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @esgs_ring, i32 0, i32 %95
+  store i32 %61, i32 addrspace(3)* %96, align 4
+  br label %endif11500
+
+endif11500: ; preds = %if11500, %main_body
+  %97 = ptrtoint <4 x i32> addrspace(6)* %0 to i32
+  %98 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> undef, i32 %97, 0
+  %99 = ptrtoint <8 x i32> addrspace(6)* %1 to i32
+  %100 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %98, i32 %99, 1
+  %101 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %100, i32 %2, 2
+  %102 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %101, i32 %3, 3
+  %103 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %102, i32 %5, 5
+  %104 = ptrtoint <4 x i32> addrspace(6)* %8 to i32
+  %105 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %103, i32 %104, 8
+  %106 = ptrtoint <8 x i32> addrspace(6)* %9 to i32
+  %107 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %105, i32 %106, 9
+  %108 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %107, i32 %12, 12
+  %109 = bitcast i32 %25 to float
+  %110 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %108, float %109, 13
+  %111 = bitcast i32 %26 to float
+  %112 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %110, float %111, 14
+  %113 = bitcast i32 %27 to float
+  %114 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %112, float %113, 15
+  %115 = bitcast i32 %28 to float
+  %116 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %114, float %115, 16
+  %117 = bitcast i32 %29 to float
+  %118 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %116, float %117, 17
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %118
+}
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+; Function Attrs: nounwind readonly willreturn
+declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+; Function Attrs: nounwind readonly willreturn
+declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32 immarg) #2
+
+attributes #0 = { "amdgpu-32bit-address-high-bits"="0xffff8000" "amdgpu-flat-work-group-size"="128,128" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone willreturn }
+attributes #2 = { nounwind readonly willreturn }
+attributes #3 = { nounwind readnone }
+
+!0 = !{i32 0, i32 64}