diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -278,12 +278,12 @@ static cl::opt<bool> EnableAtomicOptimizations("amdgpu-atomic-optimizations", cl::desc("Enable atomic optimizations"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy( "amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), - cl::init(ScanOptions::DPP), + cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -902,14 +902,36 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB32_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB32_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_movk_i32 s4, 0x3ffc +; GFX6-NEXT: s_movk_i32 s8, 0x3ffc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX6-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB32_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB32_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -917,14 +939,36 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB32_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB32_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_movk_i32 s4, 0x3ffc +; GFX7-NEXT: s_movk_i32 s8, 0x3ffc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX7-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB32_3: +; 
GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB32_4: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst @@ -935,37 +979,81 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) { ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB33_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB33_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_mov_b32 s9, 4 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_mov_b32 s2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB33_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB33_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0 -; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB33_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_mov_b32 s8, 0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB33_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: s_mov_b32 s9, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_mov_b32 s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB33_3: +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB33_4: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP 
-verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s declare i1 @llvm.amdgcn.wqm.vote(i1) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizations=false -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) dereferenceable(18446744073709551615) %arg0, i32 %arg1) { ; GCN-LABEL: name: mmo_offsets0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -9,14 +9,27 @@ ; CHECK-LABEL: add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_add v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_add_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; 
CHECK-NEXT: s_endpgm @@ -31,14 +44,27 @@ ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_sub v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_sub_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -53,14 +79,26 @@ ; CHECK-LABEL: and: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_and v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_and_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -75,14 +113,28 @@ ; CHECK-LABEL: or: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_or v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_or_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -97,14 +149,29 @@ ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_xor v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -155,14 +222,27 @@ ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -212,14 +292,27 @@ ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: 
v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -269,14 +362,28 @@ ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -326,14 +433,26 @@ ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_umin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -616,15 +735,29 @@ define protected amdgpu_kernel void 
@buffer.ptr.atomic.add(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB23_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_add_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -638,15 +771,29 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB24_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_sub_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -660,16 +807,30 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB25_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: 
s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_min_i32_e32 v0, s2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -682,16 +843,30 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB26_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_max_i32_e32 v0, s2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -704,15 +879,28 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB27_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: s_or_b64 
exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_min_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -726,16 +914,30 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB28_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_max_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -748,15 +950,28 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB29_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_and_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -770,16 +985,30 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr 
addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB30_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_or_b32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.or.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -792,15 +1021,31 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB31_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_xor v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_xor_b32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -10,16 +10,28 @@ define amdgpu_kernel void @alloc_lds_gds(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 
s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel @@ -32,18 +44,44 @@ define amdgpu_kernel void @alloc_lds_gds_align(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds_align: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:140 +; GCN-NEXT: ds_add_u32 v0, v1 offset:140 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_2: +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: ; %bb.3: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_4: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -251,11 +251,14 @@ ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations +; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to 
branches @@ -541,11 +544,14 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations +; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -847,11 +853,14 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations +; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1168,11 +1177,14 @@ ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations +; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module -amdgpu-atomic-optimizations=false < %s | FileCheck -check-prefix=GCN %s ; Check that barrier or fence in between of loads is not considered a clobber ; for the purpose of converting vector loads into scalar. 
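Illustrative sketch, not part of the patch: a test that depends on the untransformed atomic codegen can opt out per RUN line with the same flag the noclobber-barrier.ll hunk above adds. The function name and check lines below are hypothetical.

; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: plain_atomic_add:
; GCN: global_atomic_add
define amdgpu_kernel void @plain_atomic_add(ptr addrspace(1) %p) {
  ; With the optimizer disabled this lowers to a single per-lane atomic,
  ; not the mbcnt/readfirstlane scan sequence seen throughout this patch.
  %old = atomicrmw add ptr addrspace(1) %p, i32 1 seq_cst
  ret void
}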
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -7,19 +7,20 @@ ; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2 -; GCN-NEXT: s_mov_b32 s7, 0 +; GCN-NEXT: s_mov_b32 s8, 0 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GCN-NEXT: s_and_b32 s8, exec_lo, s6 -; GCN-NEXT: s_or_b32 s7, s8, s7 -; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 -; GCN-NEXT: s_cbranch_execz .LBB0_5 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GCN-NEXT: s_and_b32 s7, exec_lo, s6 +; GCN-NEXT: s_or_b32 s8, s7, s8 +; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GCN-NEXT: s_cbranch_execz .LBB0_8 ; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GCN-NEXT: ; =>This Loop Header: Depth=1 +; GCN-NEXT: ; Child Loop BB0_5 Depth 2 +; GCN-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.3: ; %bb1 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -27,18 +28,39 @@ ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo -; GCN-NEXT: s_or_saveexec_b32 s9, -1 +; GCN-NEXT: s_or_saveexec_b32 s7, -1 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf -; GCN-NEXT: s_mov_b32 exec_lo, s9 +; GCN-NEXT: s_mov_b32 exec_lo, s7 ; GCN-NEXT: v_mov_b32_e32 v0, v4 ; GCN-NEXT: s_and_b32 exec_lo, exec_lo, s5 ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: ; %bb.4: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GCN-NEXT: s_mov_b32 s7, exec_lo +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: .LBB0_5: ; %ComputeLoop +; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 +; GCN-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-NEXT: s_ff1_i32_b32 s11, s7 +; GCN-NEXT: v_readlane_b32 s12, v0, s11 +; GCN-NEXT: s_lshl_b32 s11, 1, s11 +; GCN-NEXT: s_andn2_b32 s7, s7, s11 +; GCN-NEXT: s_add_i32 s10, s10, s12 +; GCN-NEXT: s_cmp_lg_u32 s7, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_5 +; GCN-NEXT: ; %bb.6: ; %ComputeEnd +; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: v_cmp_eq_u32_e64 s7, 0, v1 +; GCN-NEXT: s_and_saveexec_b32 s11, s7 +; GCN-NEXT: s_xor_b32 s11, exec_lo, s11 +; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: ; %bb.7: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: buffer_atomic_add v0, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB0_1 -; GCN-NEXT: .LBB0_5: ; %bb5 +; GCN-NEXT: .LBB0_8: ; %bb5 ; GCN-NEXT: s_endpgm .entry: br label %bb
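For reference, an illustrative sketch (hypothetical test, not from this patch) of the divergent-operand shape that the now-default pass rewrites with the Iterative strategy: the ComputeLoop above picks each live lane with s_ff1_i32_b32, reads its contribution with v_readlane_b32, and accumulates a wave-wide sum; a single lane then issues the hardware atomic, and every lane rebuilds its own old value from v_readfirstlane_b32 of the atomic result plus its prefix sum.

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @divergent_add(ptr addrspace(1) %p, ptr addrspace(1) %out) {
  ; The addend is lane-dependent, so the pass must scan lanes (ComputeLoop)
  ; rather than scale a uniform value by s_bcnt1_i32_b64 of exec, as the
  ; uniform-operand tests earlier in this patch do.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add ptr addrspace(1) %p, i32 %tid seq_cst
  %gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
  store i32 %old, ptr addrspace(1) %gep
  ret void
}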