diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -944,6 +944,8 @@ return true; } + bool useAA() const override; + bool enableSubRegLiveness() const override { return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -50,6 +50,10 @@ cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); +static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", + cl::desc("Enable the use of AA during codegen."), + cl::init(true)); + GCNSubtarget::~GCNSubtarget() = default; R600Subtarget & @@ -599,6 +603,8 @@ return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); } +bool GCNSubtarget::useAA() const { return UseAA; } + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) return getMaxWavesPerEU(); diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll --- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -1,13 +1,14 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -; There is no dependence between the store and the two loads. So we can combine the loads -; and the combined load is at the original place of the second load. +; There is no dependence between the store and the two loads. So we can combine +; the loads and schedule it freely. 
; GCN-LABEL: {{^}}ds_combine_nodep -; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 -; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 +; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8 +; GCN: s_waitcnt lgkmcnt({{[0-9]+}}) define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) { %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)* diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -54,8 +54,8 @@ ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: flat_store_dword -; CHECK: s_load_dword [[SVAL:s[0-9]+]] +; CHECK: s_load_dwordx2 s{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK: s_load_dword [[SVAL:s[0-9]+]], s{{\[}}[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1645,8 +1645,8 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s2, 0xffff ; CI-NEXT: s_mov_b32 s3, 0 +; CI-NEXT: s_mov_b32 s2, 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_and_b32 s4, s4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -7,23 +7,23 @@ 
; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; CHECK-NEXT: global_load_dword v11, v[1:2], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12 +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -122,38 +122,26 @@ ; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56 ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16 +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48 -; 
GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32 +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60 +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -132,8 +132,8 @@ ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2 - -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELTGE8-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELTGE8: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} @@ -141,8 +141,9 @@ ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen 
offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -168,8 +169,8 @@ ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24 - -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELTGE8-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELTGE8: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} @@ -177,8 +178,9 @@ ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn 
-mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s ; We expect a two digit VGPR usage here, not a three digit. ; CHECK: NumVgprs: {{[0-9][0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s +; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s +; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 
-misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} ; SI-MINREG: NumVgprs: {{[1-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-amdgpu-aa=0 -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s ; We expect a three digit VGPR usage here since only one wave requested. ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -47,9 +47,9 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; CHECK-NEXT: BB1_1: ; %bb9 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1