diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s + +define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd 
ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 
{ +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + + +define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float 
[[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{ +; IR-ITERATIVE-LABEL: 
@global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + 
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret float %result +} + + +define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] 
syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val 
syncscope("agent") monotonic + ret float %result +} + + +define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: 
@global_atomic_fmin_div_address_div_value_agent_scope( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret float %result +} + +define amdgpu_ps float 
@global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret float %result +} + +define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret float [[RESULT]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret float [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret float %result +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s + +define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) 
inreg %ptr, float inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float 
[[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + + +define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] 
syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void 
@global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + + +define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: 
ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, 
align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + + +define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) 
#0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr 
addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-DPP-NEXT: 
[[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -0,0 +1,3246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn 
-amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare float @div.float.value() + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap 
v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: 
s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; 
%bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) 
%ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 
exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 
exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 
exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt 
vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, 
s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: 
s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, 
s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, 
s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; 
GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 
exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: 
s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: 
v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; 
GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], 
s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; 
GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-DPP-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, 
v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = 
atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: 
s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 
s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; 
GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; 
GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; 
GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, 
div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 
s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; 
GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: 
+; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: 
s_cbranch_execnz .LBB5_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; 
GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, 
v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-NEXT: s_nop 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; 
GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-NEXT: s_nop 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 
s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; 
GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, 
div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; 
GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1164-DPP-NEXT: s_nop 0 +; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_atomic_add_f32 v40, v0, s[34:35] +; GFX1132-DPP-NEXT: s_nop 0 +; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: 
global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; 
GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 
+; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; 
GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 
s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: 
s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; 
GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: 
s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 
s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; 
GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #2 = { strictfp} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -0,0 +1,2338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare float @div.float.value() + +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: 
+; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: 
global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: 
s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; 
GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_1 +; 
GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; 
GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; 
GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: 
s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; 
GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: 
v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; 
GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; 
GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: 
v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start 
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 
+; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, 
s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: 
global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: 
s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; 
GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: 
v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7LESS-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; 
GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; 
GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: 
s_cbranch_execnz .LBB4_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: 
s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; 
GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, 
v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 
20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; 
GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; 
GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: 
s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: 
v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 
s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 
s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], 
s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -0,0 +1,2338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare float @div.float.value() + +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: 
s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], 
s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, 
v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define 
amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 
v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: 
v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm 
+; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 
exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: 
global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: 
s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: 
global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: 
global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; 
GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: 
s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: 
s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; 
GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; 
GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; 
GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; 
GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; 
GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; 
GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 
4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_1: ; 
%atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, 
s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 
s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; 
GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 
s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: 
s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 
v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 
0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: 
s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 
0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v1, v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; 
GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: 
s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v1, v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v0, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -0,0 +1,3445 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 
-amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare float @div.float.value() + +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 
0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB0_1: ; 
%atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, -4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: 
global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 
s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; 
GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; 
GFX1064-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 
+; GFX1032-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; 
GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, 
div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; 
GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: 
s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 
s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 
+; GFX7LESS-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; 
GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: 
v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, 
s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: 
v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; 
GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 
s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; 
GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB3_1: ; 
%atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; 
GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: 
s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] 
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: 
global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: 
global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: 
s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: 
s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: 
s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: 
s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 
s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic + ret void +} + + +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; 
GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; 
GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 
s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: 
s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: 
s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 
v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; 
GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 
0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; 
GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_subrev_f32_e32 v0, 4.0, v1 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s11 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s10 +; GFX7LESS-NEXT: s_mov_b32 s13, s9 +; GFX7LESS-NEXT: s_mov_b32 s12, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s4, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[4:5] +; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) 
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s11 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 44 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-NEXT: 
global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s11 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s12, s8 +; GFX1064-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-NEXT: s_mov_b32 s13, s9 +; GFX1064-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-NEXT: s_mov_b32 s14, s10 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-NEXT: s_getpc_b64 s[6:7] +; GFX1064-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-NEXT: 
global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s12, s8 +; GFX1032-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-NEXT: s_mov_b32 s13, s9 +; GFX1032-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-NEXT: s_mov_b32 s14, s10 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-NEXT: s_getpc_b64 s[6:7] +; GFX1032-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-NEXT: 
global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s12, s8 +; GFX1164-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-NEXT: s_mov_b32 s13, s9 +; GFX1164-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-NEXT: s_mov_b32 s14, s10 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-NEXT: s_getpc_b64 s[6:7] +; GFX1164-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; 
GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1132-NEXT: s_getpc_b64 s[6:7] +; GFX1132-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s8 +; 
GFX9-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX9-DPP-NEXT: s_mov_b32 s13, s9 +; GFX9-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s10 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11 +; 
GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1064-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1064-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; 
GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1032-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1032-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dword v2, v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: 
s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s12, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s9 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s10 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1164-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1164-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s4, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s5, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], 
s[6:7] +; GFX1132-DPP-NEXT: s_getpc_b64 s[6:7] +; GFX1132-DPP-NEXT: s_add_u32 s6, s6, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s7, s7, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[6:7], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1132-DPP-NEXT: global_load_b32 v2, v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call float @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #2 = { strictfp} + diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll --- 
a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1,42 +1,334 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX678,HAS-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,HAS-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f32: -; GFX678-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0 -; HAS-ATOMICS: ds_add_rtn_f32 v0, v0, [[K]] - -; NO-ATOMICS: ds_read_b32 -; NO-ATOMICS: v_add_f32 -; NO-ATOMICS: ds_cmpst_rtn_b32 -; NO-ATOMICS: s_cbranch_execnz define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { +; VI-LABEL: lds_atomic_fadd_ret_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_add_rtn_f32 v0, v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f32: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 
seq_cst ret float %result } -; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f32: -; GFX678-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0 -; HAS-ATOMICS: ds_add_f32 v0, [[K]] define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { +; VI-LABEL: lds_atomic_fadd_noret_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_add_f32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_f32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 
+; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst ret void } -; GCN-LABEL: {{^}}lds_ds_fadd: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt lgkmcnt(0) -; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { +; VI-LABEL: lds_ds_fadd: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s4, s3, 3 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_add_f32 v2, v0 offset:64 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_add_rtn_f32 v2, v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fadd: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 
v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s4, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s3, s3, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fadd: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s4, s3, 3 +; GFX7-NEXT: s_add_i32 s4, s4, 32 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_lshl_b32 s6, s3, 3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, 
v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_3 +; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_5 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_ds_fadd: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s4, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: ds_read_b32 v0, v0 offset:32 +; GFX8-NEXT: s_add_i32 s3, s3, 4 +; GFX8-NEXT: s_lshl_b32 s6, s3, 3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 
+; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: ds_read_b32 v1, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB2_3 +; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: ds_read_b32 v1, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB2_5 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
buffer_store_dword v1, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 @@ -49,15 +341,181 @@ ret void } -; GCN-LABEL: {{^}}lds_ds_fadd_one_as: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt lgkmcnt(1) -; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) { +; VI-LABEL: lds_ds_fadd_one_as: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s4, s3, 3 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: ds_add_f32 v2, v0 offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_add_rtn_f32 v2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fadd_one_as: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s4, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_lshl_b32 s3, s3, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; 
GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fadd_one_as: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s4, s3, 3 +; GFX7-NEXT: s_add_i32 s4, s4, 32 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_lshl_b32 s6, s3, 3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_3 +; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_5 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_ds_fadd_one_as: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s4, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: ds_read_b32 v0, v0 offset:32 +; GFX8-NEXT: s_add_i32 s3, s3, 4 +; GFX8-NEXT: s_lshl_b32 s6, s3, 3 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: ds_read_b32 v1, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB3_3 +; GFX8-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: ds_read_b32 v1, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB3_5 +; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 @@ -70,61 +528,580 @@ ret void } -; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { +; VI-LABEL: lds_atomic_fadd_ret_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: ds_read_b64 v[0:1], v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB4_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v3, v0 +; VI-NEXT: v_add_f64 v[0:1], 
v[3:4], 4.0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB4_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: ds_read_b64 v[0:1], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX7-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: ds_read_b64 v[0:1], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %result } -; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { +; VI-LABEL: lds_atomic_fadd_noret_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b64 v[1:2], v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB5_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; VI-NEXT: v_mov_b32_e32 v1, 
v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB5_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[1:2], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[1:2], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 
s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[1:2], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void } -; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f32: -; GCN: ds_read_b32 -; GCN: v_sub_f32 -; GCN: ds_cmpst_rtn_b32 -; GCN: s_cbranch_execnz define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwind { +; VI-LABEL: lds_atomic_fsub_ret_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_sub_f32_e32 v2, v3, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: lds_atomic_fsub_ret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_ret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_ret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst ret float %result } -; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f32: -; GCN: ds_read_b32 -; GCN: v_sub_f32 -; GCN: ds_cmpst_rtn_b32 define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwind { +; VI-LABEL: lds_atomic_fsub_noret_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB7_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_sub_f32_e32 v3, v2, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB7_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_noret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b32 v3, 
v0, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_noret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_noret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 
s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst ret void } -; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -; GCN: ds_cmpst_rtn_b64 - define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounwind { +; VI-LABEL: lds_atomic_fsub_ret_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b64 v[3:4], v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB8_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB8_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_ret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_ret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[3:4], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_ret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[3:4], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst ret double %result } -; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounwind { +; VI-LABEL: lds_atomic_fsub_noret_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b64 v[3:4], v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; VI-NEXT: v_mov_b32_e32 v3, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_noret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_noret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[3:4], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_noret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[3:4], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr 
addrspace(3) %ptr, double %val seq_cst ret void } diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -1,13 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr: -; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 -; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5] -; GCN-DAG: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 -; GCN-DAG: global_atomic_and v[[[LO]]:[[HI]]], [[THREE]], off offset:512 -; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]] define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 { +; GCN-LABEL: shl_base_atomicrmw_global_ptr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5] +; GCN-NEXT: v_mov_b32_e32 v6, 3 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: global_atomic_and v[0:1], v6, off offset:512 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 0x80, v4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32 %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64 %shl = shl i64 %cast, 2 @@ -17,14 +25,18 @@ ret void } -; GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd: -; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 -; 
GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 -; GCN-DAG: global_atomic_add_f32 v[[[LO]]:[[HI]]], [[K]], off offset:512 -; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]] define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 { +; GCN-LABEL: shl_base_global_ptr_global_atomic_fadd: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5] +; GCN-NEXT: v_mov_b32_e32 v6, 0x42c80000 +; GCN-NEXT: global_atomic_add_f32 v[0:1], v6, off offset:512 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 0x80, v4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32 %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64 %shl = shl i64 %cast, 2