Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -222,23 +222,43 @@ // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// -class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; -}]>; +class LoadFrag : PatFrag<(ops node:$ptr), (op node:$ptr)>; -class PrivateLoad : PrivateMemOp < - (ops node:$ptr), (op node:$ptr) +class StoreFrag : PatFrag < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) >; -class PrivateStore : PrivateMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +class StoreHi16 : PatFrag < + (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr) >; -def load_private : PrivateLoad ; +class PrivateAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; +}]>; + +class LocalAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; +}]>; + +class GlobalAddress : CodePatPred<[{ + return cast(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; +}]>; + +class FlatLoadAddress : CodePatPred<[{ + const auto AS = cast(N)->getAddressSpace(); + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS; +}]>; + + +def load_private : LoadFrag , PrivateAddress; +def truncstorei8_private : StoreFrag, PrivateAddress; +def truncstorei16_private : StoreFrag , PrivateAddress; +def store_private : StoreFrag , PrivateAddress; + +def store_private_hi16 : StoreHi16 , PrivateAddress; +def truncstorei8_private_hi16 : StoreHi16, PrivateAddress; -def truncstorei8_private : PrivateStore ; -def truncstorei16_private : PrivateStore ; -def store_private : PrivateStore ; class GlobalMemOp : PatFrag (N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; @@ -315,8 +335,12 @@ def az_extloadi8_local : LocalLoad ; def sextloadi8_local : 
LocalLoad ; -def extloadi8_private : PrivateLoad ; -def sextloadi8_private : PrivateLoad ; +def store_local_hi16 : StoreHi16 , LocalAddress; +def truncstorei8_local_hi16 : StoreHi16, LocalAddress; + + +def extloadi8_private : LoadFrag , PrivateAddress; +def sextloadi8_private : LoadFrag , PrivateAddress; def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; @@ -331,8 +355,8 @@ def az_extloadi16_local : LocalLoad ; def sextloadi16_local : LocalLoad ; -def extloadi16_private : PrivateLoad ; -def sextloadi16_private : PrivateLoad ; +def extloadi16_private : LoadFrag , PrivateAddress; +def sextloadi16_private : LoadFrag , PrivateAddress; def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; @@ -347,9 +371,12 @@ def truncstorei8_global : GlobalStore ; def truncstorei16_global : GlobalStore ; -def local_store : LocalStore ; -def truncstorei8_local : LocalStore ; -def truncstorei16_local : LocalStore ; +def truncstorei8_global_hi16 : StoreHi16 , GlobalAddress; +def truncstorei16_global_hi16 : StoreHi16 , GlobalAddress; + +def local_store : StoreFrag , LocalAddress; +def truncstorei8_local : StoreFrag , LocalAddress; +def truncstorei16_local : StoreFrag , LocalAddress; def local_load : LocalLoad ; Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -1261,6 +1261,16 @@ defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; + +let Predicates = [HasD16LoadStore] in { + // Hiding the extract high pattern in the PatFrag seems to not + // automatically increase the complexity. 
+let AddedComplexity = 1 in { +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +} +} + //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -576,6 +576,11 @@ def : DSWritePat ; def : DSWritePat ; +let Predicates = [HasD16LoadStore] in { +def : DSWritePat ; +def : DSWritePat ; +} + let AddedComplexity = 100 in { def : DSWritePat ; Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -647,6 +647,10 @@ def flat_truncstorei8 : flat_st ; def flat_truncstorei16 : flat_st ; +def flat_truncstorei8_hi16 : StoreHi16, FlatLoadAddress; +def flat_truncstorei16_hi16 : StoreHi16, FlatLoadAddress; + + // Patterns for global loads with no offset. 
class FlatLoadPat : Pat < (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), @@ -753,6 +757,12 @@ def : FlatStorePat ; def : FlatStorePat ; + + let Predicates = [HasD16LoadStore] in { +def : FlatStorePat ; +def : FlatStorePat ; +} + } // End Predicates = [HasFlatAddressSpace] let Predicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { @@ -780,6 +790,13 @@ def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; + + let Predicates = [HasD16LoadStore] in { +def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; +} + + def : FlatStoreSignedAtomicPat ; def : FlatStoreSignedAtomicPat ; Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s ; FIXME: Should be able 
to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: @@ -154,7 +154,8 @@ ; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]] -; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] +; CIVI: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] +; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[NEG]], off define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fneg = fsub <2 x half> , %val Index: test/CodeGen/AMDGPU/store-global.ll =================================================================== --- test/CodeGen/AMDGPU/store-global.ll +++ test/CodeGen/AMDGPU/store-global.ll @@ -85,11 +85,11 @@ } ; FUNC-LABEL: {{^}}store_i24: -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 ; SIVI-DAG: buffer_store_byte ; SIVI-DAG: buffer_store_short -; GFX9-DAG: global_store_byte +; GFX9-DAG: global_store_byte_d16_hi v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:2 ; GFX9-DAG: global_store_short ; EG: MEM_RAT MSKOR Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -0,0 +1,594 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s + +; GCN-LABEL: {{^}}store_global_hi_v2i16: +; GCN: s_waitcnt + +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + store 
i16 %hi, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_v2f16: +; GCN: s_waitcnt + +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x half> + %hi = extractelement <2 x half> %value, i32 1 + store half %hi, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_i32_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i16 + store i16 %hi, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_v2i16_i8: +; GCN: s_waitcnt + +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_i8_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i8 + store i8 %hi, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}store_global_hi_v2i16_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094 + +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 + +; VI: flat_store_short v[0:1], v2{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047 + store i16 %hi, i16 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset: +; GCN: s_waitcnt +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}} + +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 + +; VI: flat_store_short v[0:1], v{{[0-9]+$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048 + store i16 %hi, i16 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095 + +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 +; VI: flat_store_byte v[0:1], v{{[0-9]+$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095 + store i8 %trunc, i8 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: 
{{^}}store_global_hi_v2i16_i8_min_offset: +; GCN: s_waitcnt +; GFX9-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095 + +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 + +; VI: flat_store_byte v[0:1], v{{[0-9]+$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095 + store i8 %trunc, i8 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16: +; GCN: s_waitcnt + +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16(i16 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + store i16 %hi, i16 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2f16: +; GCN: s_waitcnt + +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2f16(half addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x half> + %hi = extractelement <2 x half> %value, i32 1 + store half %hi, half addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_i32_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_short v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_i32_shift(i16 addrspace(4)* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to 
i16 + store i16 %hi, i16 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8: +; GCN: s_waitcnt + +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16_i8(i8 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_i8_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: flat_store_byte v[0:1], v2 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_i8_shift(i8 addrspace(4)* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i8 + store i8 %hi, i8 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}} + +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 +; VI: flat_store_short v[0:1], v2{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16_max_offset(i16 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, i16 addrspace(4)* %out, i64 2047 + store i16 %hi, i16 addrspace(4)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset: +; GCN: s_waitcnt +; GCN: v_add_i32_e32 +; GCN: v_addc_u32_e32 + +; GFX9-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} +; VI: flat_store_short v[0:1], v2{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16_neg_offset(i16 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = 
bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, i16 addrspace(4)* %out, i64 -1023 + store i16 %hi, i16 addrspace(4)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}} + +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 +; VI-DAG: v_add_i32_e32 +; VI-DAG: v_addc_u32_e32 +; VI: flat_store_byte v[0:1], v2{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16_i8_max_offset(i8 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + %gep = getelementptr inbounds i8, i8 addrspace(4)* %out, i64 4095 + store i8 %trunc, i8 addrspace(4)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset: +; GCN: s_waitcnt +; GCN-DAG: v_add_i32_e32 +; GCN-DAG: v_addc_u32_e32 + +; GFX9-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}} +; VI-DAG: v_lshrrev_b32_e32 v2, 16, v2 +; VI: flat_store_byte v[0:1], v2{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_flat_hi_v2i16_i8_neg_offset(i8 addrspace(4)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + %gep = getelementptr inbounds i8, i8 addrspace(4)* %out, i64 -4095 + store i8 %trunc, i8 addrspace(4)* %gep + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_v2i16: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2i16(i16* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + store i16 %hi, i16* %out + 
ret void +} + +; GCN-LABEL: {{^}}store_private_hi_v2f16: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2f16(half* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x half> + %hi = extractelement <2 x half> %value, i32 1 + store half %hi, half* %out + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_i32_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_i32_shift(i16* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i16 + store i16 %hi, i16* %out + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2i16_i8(i8* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8* %out + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_i8_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_i8_shift(i8* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i8 + store i8 %hi, i8* %out + ret void +} + +; GCN-LABEL: 
{{^}}store_private_hi_v2i16_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen offset:4094{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, i16* %out, i64 2047 + store i16 %hi, i16* %gep + ret void +} + + + +; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s4{{$}} + +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], s4{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2i16_nooff(i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + store volatile i16 %hi, i16* null + ret void +} + + +; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: +; GCN: s_waitcnt + +; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s4{{$}} + +; VI: v_lshrrev_b32_e32 v0, 16, v0 +; VI: buffer_store_byte v0, off, s[0:3], s4{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + store volatile i8 %trunc, i8* null + ret void +} + +; GCN-LABEL: {{^}}store_local_hi_v2i16: +; GCN: s_waitcnt + +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: ds_write_b16 v0, v1 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = 
extractelement <2 x i16> %value, i32 1 + store i16 %hi, i16 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}store_local_hi_v2f16: +; GCN: s_waitcnt + +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: ds_write_b16 v0, v1 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x half> + %hi = extractelement <2 x half> %value, i32 1 + store half %hi, half addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}store_local_hi_i32_shift: +; GCN: s_waitcnt + +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: ds_write_b16 v0, v1 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 { +entry: + %hi32 = lshr i32 %value, 16 + %hi = trunc i32 %hi32 to i16 + store i16 %hi, i16 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}store_local_hi_v2i16_i8: +; GCN: s_waitcnt + +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: ds_write_b8 v0, v1 + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 { +entry: + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset: +; GCN: s_waitcnt +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}} + +; VI: v_lshrrev_b32_e32 v1, 16, v1 +; VI: ds_write_b16 v0, v1 offset:65534{{$}} + +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 { +entry: + ; FIXME: ABI for pre-gfx9 + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds i16, 
i16 addrspace(3)* %out, i64 32767 + store i16 %hi, i16 addrspace(3)* %gep + ret void +} + +attributes #0 = { nounwind }