Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -898,6 +898,10 @@
   "buffer_load_dwordx4", v4i32
 >;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
@@ -1794,6 +1798,10 @@
 >;
 }
 let SubtargetPredicate = isGFX6GFX7 in {
+defm : MUBUFStore_Atomic_Pattern ;
+defm : MUBUFStore_Atomic_Pattern ;
+defm : MUBUFStore_Atomic_Pattern ;
+defm : MUBUFStore_Atomic_Pattern ;
 defm : MUBUFStore_Atomic_Pattern ;
 defm : MUBUFStore_Atomic_Pattern ;
 } // End Predicates = isGFX6GFX7
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1089,6 +1089,10 @@
 let OtherPredicates = [HasFlatAddressSpace] in {
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
+def : FlatLoadPat ;
 def : FlatLoadPat ;
 def : FlatLoadPat ;
 def : FlatLoadPat ;
@@ -1126,6 +1130,11 @@
 def : FlatStoreAtomicPat ;
 def : FlatStoreAtomicPat ;
+def : FlatStoreAtomicPat ;
+def : FlatStoreAtomicPat ;
+def : FlatStoreAtomicPat ;
+def : FlatStoreAtomicPat ;
+
 foreach as = [ "flat", "global" ] in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1310,6 +1319,10 @@
 let OtherPredicates = [HasFlatGlobalInsts] in {
+defm : GlobalFLATLoadPats ;
+defm : GlobalFLATLoadPats ;
+defm : GlobalFLATLoadPats ;
+defm : GlobalFLATLoadPats ;
 defm : GlobalFLATLoadPats ;
 defm : GlobalFLATLoadPats ;
 defm : GlobalFLATLoadPats ;
@@ -1369,6 +1382,10 @@
 defm : GlobalFLATLoadPats_D16 ;
 }
+defm : GlobalFLATAtomicStorePats ;
+defm : GlobalFLATAtomicStorePats ;
+defm : GlobalFLATAtomicStorePats ;
+defm : GlobalFLATAtomicStorePats ;
 defm : GlobalFLATAtomicStorePats ;
 defm : GlobalFLATAtomicStorePats ;
Index: llvm/test/CodeGen/AMDGPU/flat_atomics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -1128,3 +1128,149 @@
   store atomic float %in, float* %ptr seq_cst, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}atomic_load_i8_offset:
+; CIVI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
+; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i8_offset(i8* %in, i8* %out) {
+entry:
+  %gep = getelementptr i8, i8* %in, i64 16
+  %val = load atomic i8, i8* %gep seq_cst, align 1
+  store i8 %val, i8* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i8:
+; GCN: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i8(i8* %in, i8* %out) {
+entry:
+  %val = load atomic i8, i8* %in seq_cst, align 1
+  store i8 %val, i8* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i8_addr64_offset:
+; CIVI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GFX9: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
+; GCN: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i8_addr64_offset(i8* %in, i8* %out, i64 %index) {
+entry:
+  %ptr = getelementptr i8, i8* %in, i64 %index
+  %gep = getelementptr i8, i8* %ptr, i64 16
+  %val = load atomic i8, i8* %gep seq_cst, align 1
+  store i8 %val, i8* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i8_offset:
+; CIVI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX9: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8* %out) {
+entry:
+  %gep = getelementptr i8, i8* %out, i64 16
+  store atomic i8 %in, i8* %gep seq_cst, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i8:
+; GCN: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @atomic_store_i8(i8 %in, i8* %out) {
+entry:
+  store atomic i8 %in, i8* %out seq_cst, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i8_addr64_offset:
+; CIVI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX9: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, i8* %out, i64 %index) {
+entry:
+  %ptr = getelementptr i8, i8* %out, i64 %index
+  %gep = getelementptr i8, i8* %ptr, i64 16
+  store atomic i8 %in, i8* %gep seq_cst, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i16_offset:
+; CIVI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i16_offset(i16* %in, i16* %out) {
+entry:
+  %gep = getelementptr i16, i16* %in, i64 8
+  %val = load atomic i16, i16* %gep seq_cst, align 2
+  store i16 %val, i16* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i16:
+; GCN: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i16(i16* %in, i16* %out) {
+entry:
+  %val = load atomic i16, i16* %in seq_cst, align 2
+  store i16 %val, i16* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i16_addr64_offset:
+; CIVI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GFX9: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_load_i16_addr64_offset(i16* %in, i16* %out, i64 %index) {
+entry:
+  %ptr = getelementptr i16, i16* %in, i64 %index
+  %gep = getelementptr i16, i16* %ptr, i64 8
+  %val = load atomic i16, i16* %gep seq_cst, align 2
+  store i16 %val, i16* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i16_offset:
+; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16* %out) {
+entry:
+  %gep = getelementptr i16, i16* %out, i64 8
+  store atomic i16 %in, i16* %gep seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i16:
+; GCN: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @atomic_store_i16(i16 %in, i16* %out) {
+entry:
+  store atomic i16 %in, i16* %out seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i16_addr64_offset:
+; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, i16* %out, i64 %index) {
+entry:
+  %ptr = getelementptr i16, i16* %out, i64 %index
+  %gep = getelementptr i16, i16* %ptr, i64 8
+  store atomic i16 %in, i16* %gep seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_f16_offset:
+; CIVI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+; GFX9: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_f16_offset(half %in, half* %out) {
+entry:
+  %gep = getelementptr half, half* %out, i64 8
+  store atomic half %in, half* %gep seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_f16:
+; GCN: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
+define amdgpu_kernel void @atomic_store_f16(half %in, half* %out) {
+entry:
+  store atomic half %in, half* %out seq_cst, align 2
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1316,3 +1316,124 @@
   store atomic float %in, float addrspace(1)* %ptr seq_cst, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}atomic_load_i8_offset:
+; SIVI: buffer_load_ubyte [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; SIVI: buffer_store_byte [[RET]]
+
+; GFX9: global_load_ubyte [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
+define amdgpu_kernel void @atomic_load_i8_offset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i8, i8 addrspace(1)* %in, i64 16
+  %val = load atomic i8, i8 addrspace(1)* %gep seq_cst, align 1
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i8_negoffset:
+; SI: buffer_load_ubyte [[RET:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+
+; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00
+; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
+; VI: flat_load_ubyte [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+
+; GFX9: global_load_ubyte [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
+define amdgpu_kernel void @atomic_load_i8_negoffset(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i8, i8 addrspace(1)* %in, i64 -512
+  %val = load atomic i8, i8 addrspace(1)* %gep seq_cst, align 1
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i8_offset:
+; SI: buffer_store_byte {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; VI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_byte {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, i8 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i8, i8 addrspace(1)* %out, i64 16
+  store atomic i8 %in, i8 addrspace(1)* %gep seq_cst, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i8:
+; SI: buffer_store_byte {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; VI: flat_store_byte v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_byte {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_store_i8(i8 %in, i8 addrspace(1)* %out) {
+entry:
+  store atomic i8 %in, i8 addrspace(1)* %out seq_cst, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i16_offset:
+; SIVI: buffer_load_ushort [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; SIVI: buffer_store_short [[RET]]
+
+; GFX9: global_load_ushort [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
+define amdgpu_kernel void @atomic_load_i16_offset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i16, i16 addrspace(1)* %in, i64 8
+  %val = load atomic i16, i16 addrspace(1)* %gep seq_cst, align 2
+  store i16 %val, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_load_i16_negoffset:
+; SI: buffer_load_ushort [[RET:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+
+; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xfffffe00
+; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
+; VI: flat_load_ushort [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+
+; GFX9: global_load_ushort [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
+define amdgpu_kernel void @atomic_load_i16_negoffset(i16 addrspace(1)* %in, i16 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i16, i16 addrspace(1)* %in, i64 -256
+  %val = load atomic i16, i16 addrspace(1)* %gep seq_cst, align 2
+  store i16 %val, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i16_offset:
+; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, i16 addrspace(1)* %out) {
+entry:
+  %gep = getelementptr i16, i16 addrspace(1)* %out, i64 8
+  store atomic i16 %in, i16 addrspace(1)* %gep seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_i16:
+; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_store_i16(i16 %in, i16 addrspace(1)* %out) {
+entry:
+  store atomic i16 %in, i16 addrspace(1)* %out seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_f16_offset:
+; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+define amdgpu_kernel void @atomic_store_f16_offset(half %in, half addrspace(1)* %out) {
+entry:
+  %gep = getelementptr half, half addrspace(1)* %out, i64 8
+  store atomic half %in, half addrspace(1)* %gep seq_cst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_f16:
+; SI: buffer_store_short {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+; VI: flat_store_short v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
+; GFX9: global_store_short {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_store_f16(half %in, half addrspace(1)* %out) {
+entry:
+  store atomic half %in, half addrspace(1)* %out seq_cst, align 2
+  ret void
+}