diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -173,12 +173,6 @@ "fast", "fast register allocator", createFastVGPRRegisterAllocator); } -static cl::opt EnableSROA( - "amdgpu-sroa", - cl::desc("Run SROA after promote alloca pass"), - cl::ReallyHidden, - cl::init(true)); - static cl::opt EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), @@ -1004,8 +998,6 @@ if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createAMDGPUPromoteAlloca()); - if (EnableSROA) - addPass(createSROAPass()); if (isPassEnabled(EnableScalarIRPasses)) addStraightLineScalarOptimizationPasses(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; This is a copy of sibling-call.ll, but stops after the IRTranslator. define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}store_fi_lifetime: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -2,10 +2,10 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s ; Test that non-entry function frame indices are expanded properly to diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -enable-ipra < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Kernels are not called, so there is no call preserved mask. ; GCN-LABEL: {{^}}kernel: diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -187,7 +187,6 @@ ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: SROA ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR optimizations @@ -458,7 +457,6 @@ ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: SROA ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis @@ -751,7 +749,6 @@ ; GCN-O2-NEXT: Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: SROA ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O2-NEXT: Scalar Evolution Analysis @@ -1054,7 +1051,6 @@ ; GCN-O3-NEXT: Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: SROA ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O3-NEXT: Scalar Evolution Analysis diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 { ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo: @@ -2705,4 +2705,3 @@ } attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s define <2 x i16> @load_local_lo_v2i16_undeflo(ptr addrspace(3) %in) #0 { ; GFX900-LABEL: load_local_lo_v2i16_undeflo: diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Test calls when called by other callable functions rather than ; kernels. diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll --- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck %s ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll --- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll @@ -63,8 +63,8 @@ ; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 } ; REMARK-NEXT: Function: test_kernel ; REMARK-NEXT: Args: -; REMARK-NEXT: - String: ' Dynamic Stack: -; REMARK-NEXT: - DynamicStack: 'False' +; REMARK-NEXT: - String: ' Dynamic Stack: +; REMARK-NEXT: - DynamicStack: 'False' ; REMARK-NEXT: .. ; REMARK-NEXT: --- !Analysis ; REMARK-NEXT: Pass: kernel-resource-usage @@ -179,7 +179,7 @@ ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32 ; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10 -; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 64 +; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True ; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8 ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0 @@ -187,7 +187,7 @@ ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0 declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture readonly, i8, i64, i1 immarg) - + define amdgpu_kernel void @test_indirect_w_static_stack() !dbg !10 { %alloca = alloca <10 x i64>, align 16, addrspace(5) call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 40, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s target datalayout = "A5" ; FIXME: Why is this commuted only sometimes? diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s -; RxN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-MUBUF %s +; RxN: llc -march=amdgcn -mcpu=gfx906 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 < %s | FileCheck %s define i64 @i64_test(i64 %i) nounwind readnone { %loc = alloca i64 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_asm.ll.expected @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 < %s | FileCheck %s define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -70,10 +70,40 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, s33 +; CHECK-NEXT: s_mov_b32 s8, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_addk_i32 s32, 0x600 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: v_mov_b32_e32 v2, 3 +; CHECK-NEXT: v_mov_b32_e32 v3, 4 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s33, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: main: @@ -84,16 +114,31 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_addk_i32 s32, 0x600 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, x@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, x@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v2, 1 +; CHECK-NEXT: v_mov_b32_e32 v3, 2 +; CHECK-NEXT: v_mov_b32_e32 v4, 3 +; CHECK-NEXT: v_mov_b32_e32 v5, 4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_addk_i32 s32, 0xfa00 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -11,10 +11,40 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, s33 +; CHECK-NEXT: s_mov_b32 s8, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_addk_i32 s32, 0x600 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: v_mov_b32_e32 v2, 3 +; CHECK-NEXT: v_mov_b32_e32 v3, 4 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 +; CHECK-NEXT: .LBB0_2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s33, s8 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %1 = alloca i32, align 4, addrspace(5) %2 = alloca i32, align 4, addrspace(5) @@ -61,16 +91,31 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s6, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_addk_i32 s32, 0x600 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, x@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, x@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v2, 1 +; CHECK-NEXT: v_mov_b32_e32 v3, 2 +; CHECK-NEXT: v_mov_b32_e32 v4, 3 +; CHECK-NEXT: v_mov_b32_e32 v5, 4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:8 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 +; CHECK-NEXT: s_addk_i32 s32, 0xfa00 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -stop-after=finalize-isel -debug-only=isel -o /dev/null %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -stop-after=finalize-isel -O3 -debug-only=isel -o /dev/null %s 2>&1 | FileCheck %s define i64 @i64_test(i64 %i) nounwind readnone { %loc = alloca i64 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -stop-after=finalize-isel -debug-only=isel -o /dev/null %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -stop-after=finalize-isel -O3 -debug-only=isel -o /dev/null %s 2>&1 | FileCheck %s define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: