diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -92,6 +92,11 @@ cl::init(true), cl::ReallyHidden); +static cl::opt CheckFlatLocalAlias( + "check-flat-local-alias", + cl::desc("Check whether a FLAT pointer could alias to local objects"), + cl::init(true), cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { @@ -2823,8 +2828,8 @@ return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); } -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb) const { +bool SIInstrInfo::areMemAccessesTriviallyDisjoint( + const MachineInstr &MIa, const MachineInstr &MIb) const { assert(MIa.mayLoadOrStore() && "MIa must load from or modify a memory location"); assert(MIb.mayLoadOrStore() && @@ -2837,6 +2842,32 @@ if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; + auto FlatMayAliasToLocalObject = [](const MachineInstr &MI) { + assert(isFLAT(MI) && !MI.memoperands_empty()); + if (!CheckFlatLocalAlias) + return true; + if (MI.memoperands_empty()) + return true; + for (auto *MMO : MI.memoperands()) { + if (MMO->getAddrSpace() != AMDGPUAS::FLAT_ADDRESS) + continue; + if (const auto *Ptr = MMO->getValue()) { + auto Obj = + getUnderlyingObject(Ptr->stripPointerCastsAndInvariantGroups()); + if (const LoadInst *LI = dyn_cast(Obj)) { + if (LI->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + continue; + } else if (const Argument *Arg = dyn_cast(Obj)) { + const Function *F = Arg->getParent(); + if (F && F->getCallingConv() == CallingConv::AMDGPU_KERNEL) + continue; + } + } + return true; + } + return false; + }; + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -2846,7 +2877,10 @@ if (isDS(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); + if (!isFLAT(MIb) || isSegmentSpecificFLAT(MIb)) + return true; + + return !FlatMayAliasToLocalObject(MIb); } if (isMUBUF(MIa) || isMTBUF(MIa)) { @@ -2867,6 +2901,9 @@ if (isFLAT(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); + if (isDS(MIb)) + return !FlatMayAliasToLocalObject(MIa); + return false; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -1,10 +1,10 @@ -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -disable-promote-alloca-to-vector -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE-VECT -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -show-mc-encoding --amdhsa-code-object-version=2 -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -check-flat-local-alias=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s diff --git a/llvm/test/CodeGen/AMDGPU/disjoint-memory-access.ll b/llvm/test/CodeGen/AMDGPU/disjoint-memory-access.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/disjoint-memory-access.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=COMMON,AMDGPUAA %s +; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=gfx906 -mattr=-load-store-opt -verify-machineinstrs -o - %s | FileCheck --check-prefixes=COMMON,NOAMDGPUAA %s + +@lds = internal addrspace(3) global [512 x float] undef, align 4 + +; COMMON-LABEL: test_0 + +; Once AA is enhanced to tell that `%out` won't alias to `@lds`, we should be +; able to schedule LDS/FLAT memory operations apart and merge them accordingly. +; AMDGPUAA-DAG: ds_read2 +; AMDGPUAA-DAG: ds_read +; AMDGPUAA: flat_store_dwordx3 + +; Without AA, we should still be able to schedule them apart. +; NOAMDGPUAA-COUNT-3: ds_read +; NOAMDGPUAA-COUNT-3: flat_store_dword +define amdgpu_kernel void @test_0(float* %out) { + %l0 = getelementptr [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 + %v0 = load float, float addrspace(3)* %l0 + %p0 = getelementptr float, float* %out, i64 0 + store float %v0, float* %p0 + %l1 = getelementptr [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 + %v1 = load float, float addrspace(3)* %l1 + %p1 = getelementptr float, float* %out, i64 1 + store float %v1, float* %p1 + %l2 = getelementptr [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 + %v2 = load float, float addrspace(3)* %l2 + %p2 = getelementptr float, float* %out, i64 2 + store float %v2, float* %p2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -38,8 +38,6 @@ ; CHECK: successors: %bb.1(0x80000000) ; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead %11 ; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) - ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3) ; CHECK: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def %15, 851978 /* regdef:VGPR_LO16 */, def %16 ; CHECK: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec @@ -47,6 +45,8 @@ ; CHECK: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def %21, 851978 /* regdef:VGPR_LO16 */, def %22 ; CHECK: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_LO16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_LO16 */, %15, 851977 /* reguse:VGPR_LO16 */, %16, 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_LO16 */, [[DS_READ_B32_gfx9_2]] ; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] ; CHECK: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s ; We expect a two digit VGPR usage here, not a three digit. ; CHECK: NumVgprs: {{[0-9][0-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s -; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s -; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s ; SI-MINREG: NumSgprs: {{[1-9]$}} ; SI-MINREG: NumVgprs: {{[1-9]$}} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -check-flat-local-alias=0 -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s ; We expect a three digit VGPR usage here since only one wave requested. ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}