Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -813,7 +813,7 @@ let SubtargetPredicate = HasD16LoadStore in { defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16", VGPR_32, i32 + "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1 >; defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < @@ -821,7 +821,7 @@ >; defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16", VGPR_32, i32 + "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < @@ -829,7 +829,7 @@ >; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < - "buffer_load_short_d16", VGPR_32, i32 + "buffer_load_short_d16", VGPR_32, i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < @@ -1182,6 +1182,33 @@ >; } +multiclass MUBUFScratchLoadPat_Lo16 { + def : Pat < + (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), vt:$hi), + (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + def : Pat < + (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))))), f16:$hi), + (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + + def : Pat < + (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), vt:$hi), + (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + def : Pat < + (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))), f16:$hi), + (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; +} + defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; @@ -1197,6 +1224,10 @@ defm : MUBUFScratchLoadPat_Hi16; defm : MUBUFScratchLoadPat_Hi16; defm : 
MUBUFScratchLoadPat_Hi16; + +defm : MUBUFScratchLoadPat_Lo16; +defm : MUBUFScratchLoadPat_Lo16; +defm : MUBUFScratchLoadPat_Lo16; } // BUFFER_LOAD_DWORD*, addr64=0 Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -561,6 +561,19 @@ >; } +multiclass DSReadPat_Lo16 { + def : Pat < + (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), vt:$hi), + (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $hi)) + >; + + def : Pat < + (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), f16:$hi), + (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $hi)) + >; +} + + def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; @@ -589,6 +602,11 @@ defm : DSReadPat_Hi16; defm : DSReadPat_Hi16; defm : DSReadPat_Hi16; + +defm : DSReadPat_Lo16; +defm : DSReadPat_Lo16; +defm : DSReadPat_Lo16; + } } Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -654,6 +654,30 @@ >; } +multiclass FlatLoadPat_Lo16 { + def : Pat < + (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), vt:$elt1), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt1)) + >; + + def : Pat < + (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), f16:$elt1), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt1)) + >; +} + +multiclass FlatSignedLoadPat_Lo16 { + def : Pat < + (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), vt:$elt1), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt1)) + >; + + def : Pat < + (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), f16:$elt1), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt1)) + >; +} + class FlatLoadAtomicPat : Pat < (vt (node (FLATAtomic 
i64:$vaddr, i16:$offset, i1:$slc))), (inst $vaddr, $offset, 0, $slc) @@ -764,6 +788,10 @@ defm : FlatLoadPat_Hi16 ; defm : FlatLoadPat_Hi16 ; defm : FlatLoadPat_Hi16 ; + +defm : FlatLoadPat_Lo16 ; +defm : FlatLoadPat_Lo16 ; +defm : FlatLoadPat_Lo16 ; } } @@ -801,6 +829,11 @@ defm : FlatSignedLoadPat_Hi16 ; defm : FlatSignedLoadPat_Hi16 ; defm : FlatSignedLoadPat_Hi16 ; + +defm : FlatSignedLoadPat_Lo16 ; +defm : FlatSignedLoadPat_Lo16 ; +defm : FlatSignedLoadPat_Lo16 ; + } def : FlatStoreSignedAtomicPat ; Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -100,15 +100,16 @@ ; SICIVI: buffer_store_short ; SICIVI: buffer_store_short -; GFX9: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_store_short + ; GFX9: buffer_load_ushort ; GFX9: global_load_short_d16_hi - +; GFX9: global_load_short_d16 v ; GFX9: buffer_store_dword ; GFX9: buffer_store_dword - -; GCN: buffer_load_ushort -; GCN: buffer_store_short +; GFX9: buffer_load_ushort +; GFX9: buffer_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -503,4 +503,102 @@ ret void } +; FIXME: Remove m0 init and waitcnt between reads +; FIXME: Is there a cost to using the extload over not? 
+; GCN-LABEL: {{^}}load_local_v2i16_split: +; GCN: s_waitcnt +; GFX9-NEXT: s_mov_b32 m0, -1 +; GFX9-NEXT: ds_read_u16 v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1 + %load0 = load volatile i16, i16 addrspace(3)* %in + %load1 = load volatile i16, i16 addrspace(3)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 + ret <2 x i16> %build1 +} + +; FIXME: Remove waitcnt between reads +; GCN-LABEL: {{^}}load_global_v2i16_split: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_ushort v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1 + %load0 = load volatile i16, i16 addrspace(1)* %in + %load1 = load volatile i16, i16 addrspace(1)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 + ret <2 x i16> %build1 +} + +; FIXME: Remove waitcnt between reads +; GCN-LABEL: {{^}}load_flat_v2i16_split: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_ushort v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: flat_load_short_d16_hi v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @load_flat_v2i16_split(i16 addrspace(4)* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1 + %load0 = load volatile i16, i16 addrspace(4)* %in + %load1 = load volatile i16, i16 addrspace(4)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 + %build1 = insertelement <2 x 
i16> %build0, i16 %load1, i32 1 + ret <2 x i16> %build1 +} + +; FIXME: Remove waitcnt between reads +; GCN-LABEL: {{^}}load_constant_v2i16_split: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_ushort v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1 + %load0 = load volatile i16, i16 addrspace(2)* %in + %load1 = load volatile i16, i16 addrspace(2)* %gep + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 + ret <2 x i16> %build1 +} + +; FIXME: Remove waitcnt between reads +; FIXME: Is there a cost to using the extload over not? +; GCN-LABEL: {{^}}load_private_v2i16_split: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s4 offen{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @load_private_v2i16_split(i16* %in) #0 { +entry: + %gep = getelementptr inbounds i16, i16* %in, i32 1 + %load0 = load volatile i16, i16* %in + %load1 = load volatile i16, i16* %gep + %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 + ret <2 x i16> %build1 +} + attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -0,0 +1,483 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s + +; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo: +; GCN: 
s_waitcnt +; GFX9-NEXT: ds_read_u16_d16 v0, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + ret <2 x i16> %build +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16 v0, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + ret <2 x i16> %build1 +} + +; Show that we get reasonable regalloc without physreg constraints. +; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16 v0, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v0, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo: +; GCN: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_u16_d16 v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + ret <2 x i16> %build +} + +; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16 
v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { +entry: + %load = load half, half addrspace(3)* %in + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u8_d16 v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u8 +define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_i8_d16 v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_i8 v +define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void 
@load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047 + %load = load i16, i16 addrspace(1)* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047 + %load = load half, half addrspace(1)* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void 
@load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort v{{[0-9]+}} +; VI: v_or_b32_e32 +define void @load_flat_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(4)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort v{{[0-9]+}} +; VI: v_or_b32_e32 +define void @load_flat_lo_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 { +entry: + %load = load half, half addrspace(4)* %in + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_ubyte_d16 v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ubyte v{{[0-9]+}} +; VI: v_or_b32_e32 +define void 
@load_flat_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(4)* %in + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_sbyte_d16 v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_sbyte v{{[0-9]+}} +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(4)* %in + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} +define void @load_private_lo_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16* %in, i64 2047 + %load = load i16, i16* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9-NEXT: 
s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} +define void @load_private_lo_v2f16_reglo_vreg(half* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half* %in, i64 2047 + %load = load half, half* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_lo_v2i16_reglo_vreg_nooff(i16* %in, i16 %reg) #0 { +entry: + %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_lo_v2f16_reglo_vreg_nooff(half* %in, half %reg) #0 { +entry: + %load = load volatile half, half* inttoptr (i32 4094 to half*) + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: 
{{^}}load_private_lo_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} +define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %load = load i8, i8* %gep + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_sbyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} +define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %load = load i8, i8* %gep + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i16 %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 
4094 to i8*) + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i16 %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8* %in, half %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %ext = zext i8 %load to i16 + %bc.ext = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: 
flat_load_ushort +define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047 + %load = load i16, i16 addrspace(2)* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort +define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047 + %load = load half, half addrspace(2)* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 1 + %build1 = insertelement <2 x half> %build0, half %load, i32 0 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind }