Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -320,8 +320,7 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "FlatForGlobal", "true", - "Force to generate flat instruction for global", - [FeatureNoAddr64] + "Force to generate flat instruction for global" >; // Dummy feature used to disable assembler instructions. Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -203,7 +203,7 @@ } bool hasAddr64() const { - return (getGeneration() < VOLCANIC_ISLANDS); + return !NoAddr64; } bool hasBFE() const { Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -49,6 +49,13 @@ ParseSubtargetFeatures(GPU, FullFS); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es + // on VI and newer hardware to avoid assertion failures due to missing ADDR64 + // variants of MUBUF instructions. + if (!hasAddr64() && !FS.contains("flat-for-global")) { + FlatForGlobal = true; + } + // FIXME: I don't think think Evergreen has any useful support for // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? Index: test/CodeGen/AMDGPU/ci-use-flat-for-global.ll =================================================================== --- test/CodeGen/AMDGPU/ci-use-flat-for-global.ll +++ test/CodeGen/AMDGPU/ci-use-flat-for-global.ll @@ -1,7 +1,9 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s ; There are no stack objects even though flat is used by default, so @@ -16,11 +18,37 @@ ; HSA-DEFAULT: flat_store_dword ; HSA-NODEFAULT: buffer_store_dword +; HSA-NOADDR64: flat_store_dword ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword +; NOHSA-NOADDR64: flat_store_dword define void @test(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void } + +; HSA-DEFAULT: flat_store_dword +; HSA-NODEFAULT: buffer_store_dword +; HSA-NOADDR64: flat_store_dword + +; NOHSA-DEFAULT: buffer_store_dword +; NOHSA-NODEFAULT: flat_store_dword +; NOHSA-NOADDR64: flat_store_dword +define void @test_addr64(i32 addrspace(1)* %out) { +entry: + %out.addr = alloca i32 addrspace(1)*, align 4 + + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 4 + %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4 + + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0 + store i32 1, i32 addrspace(1)* %arrayidx, align 4 + + %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1 + store i32 2, i32 addrspace(1)* %arrayidx1, align 4 + + ret void +}