diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -278,12 +278,12 @@ static cl::opt EnableAtomicOptimizations("amdgpu-atomic-optimizations", cl::desc("Enable atomic optimizations"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt AMDGPUAtomicOptimizerStrategy( "amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), - cl::init(ScanOptions::DPP), + cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -amdgpu-atomic-optimizations=false < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -amdgpu-atomic-optimizations=false < %s | FileCheck -check-prefix=GFX7 %s ; Test end to end matching of addressing modes when MUBUF is used for ; global memory. diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s declare i1 @llvm.amdgcn.wqm.vote(i1) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizations=false -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) dereferenceable(18446744073709551615) %arg0, i32 %arg1) { ; GCN-LABEL: name: mmo_offsets0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx90a -o - %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizations=false -o - %s | FileCheck %s %S = type <{ float, double }> diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @gds0 = internal addrspace(2) global [4 x i32] undef, align 4 @lds0 = internal addrspace(3) global [4 x i32] undef, align 128 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -251,11 +251,14 @@ ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations +; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches @@ -541,11 +544,14 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations +; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -847,11 +853,14 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations +; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1168,11 +1177,14 @@ ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations +; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs --amdgpu-lower-module-lds-strategy=module -amdgpu-atomic-optimizations=false < %s | FileCheck -check-prefix=GCN %s ; Check that barrier or fence in between of loads is not considered a clobber ; for the purpose of converting vector loads into scalar. diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 { ; GCN-LABEL: should_not_hoist_set_inactive: