Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -109,6 +109,7 @@ SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; unsigned isCFIntrinsic(const SDNode *Intr) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -579,6 +579,26 @@ return isUInt<12>(AM.BaseOffs) && AM.Scale == 0; } +bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { + if (Subtarget->hasFlatGlobalInsts()) + return isInt<13>(AM.BaseOffs) && AM.Scale == 0; + + if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { + // Assume that we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM); + } + + return isLegalMUBUFAddressingMode(AM); +} + bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and // additionally can do r + r + i with addr64. 
32-bit has more addressing @@ -621,22 +641,10 @@ if (AM.BaseGV) return false; - if (AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - // Assume the we will use FLAT for all global memory accesses - // on VI. - // FIXME: This assumption is currently wrong. On VI we still use - // MUBUF instructions for the r + i addressing mode. As currently - // implemented, the MUBUF instructions only work on buffer < 4GB. - // It may be possible to support > 4GB buffers with MUBUF instructions, - // by setting the stride value in the resource descriptor which would - // increase the size limit to (stride * 4GB). However, this is risky, - // because it has never been validated. - return isLegalFlatAddressingMode(AM); - } + if (AS == AMDGPUASI.GLOBAL_ADDRESS) + return isLegalGlobalAddressingMode(AM); - return isLegalMUBUFAddressingMode(AM); - } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -648,7 +656,7 @@ // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. if (DL.getTypeStoreSize(Ty) < 4) - return isLegalMUBUFAddressingMode(AM); + return isLegalGlobalAddressingMode(AM); if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. 
Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,11 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-SICIVI %s +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN 
-check-prefix=CI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" @@ -14,7 +16,6 @@ ; OPT-CI: getelementptr i8, ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: -; GCN: {{^}}BB0_2: define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 @@ -42,7 +43,8 @@ ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -69,7 +71,8 @@ ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -96,7 +99,8 @@ ; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, off, 
{{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -673,6 +677,67 @@ ret void } +; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset( +; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096 +; OPT-SICIVI: br +; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep + +; OPT-GFX9: br +; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096 +; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr + +; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset: +; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}} +define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset( +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097 +; OPT: br +; OPT: load i8, i8 addrspace(1)* %in.gep + +; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset: +define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = 
getelementptr i8, i8 addrspace(1)* %in, i64 -4097 + %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 + %tmp0 = icmp eq i32 %tid, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2