Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -67,6 +67,12 @@
   "Support unaligned global loads and stores"
 >;
 
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+  "UnalignedScratchAccess",
+  "true",
+  "Support unaligned scratch loads and stores"
+>;
+
 def FeatureXNACK : SubtargetFeature<"xnack",
   "EnableXNACK",
   "true",
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@
   bool FP64Denormals;
   bool FPExceptions;
   bool FlatForGlobal;
+  bool UnalignedScratchAccess;
   bool UnalignedBufferAccess;
   bool EnableXNACK;
   bool DebuggerInsertNops;
@@ -275,6 +276,10 @@
     return UnalignedBufferAccess;
   }
 
+  bool hasUnalignedScratchAccess() const {
+    return UnalignedScratchAccess;
+  }
+
   bool isXNACKEnabled() const {
     return EnableXNACK;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -85,6 +85,7 @@
     FP64Denormals(false),
     FPExceptions(false),
     FlatForGlobal(false),
+    UnalignedScratchAccess(false),
     UnalignedBufferAccess(false),
 
     EnableXNACK(false),
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -453,6 +453,15 @@
     return AlignedBy4;
   }
 
+  // FIXME: We have to be conservative here and assume that flat operations
+  // will access scratch. If we had access to the IR function, then we
+  // could determine if any private memory was used in the function.
+  if (!Subtarget->hasUnalignedScratchAccess() &&
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    return false;
+  }
+
   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
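Not part of the change itself, but for context: the SubtargetFeature string "unaligned-scratch-access" defined above becomes an llc -mattr flag, and none of the hunks here enable it for any processor, so the new code path is only reachable by passing the attribute explicitly. A minimal sketch of such an invocation (the file name and function below are placeholders, not taken from this patch):

; Hypothetical unaligned-scratch.ll: an under-aligned private access that the
; new allowsMisalignedMemoryAccesses check is conservative about by default.
;   llc -march=amdgcn -mcpu=tonga -mattr=-unaligned-scratch-access unaligned-scratch.ll -o -
;   llc -march=amdgcn -mcpu=tonga -mattr=+unaligned-scratch-access unaligned-scratch.ll -o -
define void @unaligned_private_dword() {
  %buf = alloca i32
  %val = load volatile i32, i32* %buf, align 1
  ret void
}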
Index: test/CodeGen/AMDGPU/flat-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/flat-address-space.ll
+++ test/CodeGen/AMDGPU/flat-address-space.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
@@ -73,7 +73,7 @@
 ; CHECK: flat_load_dwordx2
 define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  %fload = load i64, i64 addrspace(4)* %fptr, align 4
+  %fload = load i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -82,7 +82,7 @@
 ; CHECK: flat_load_dwordx4
 define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
+  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
   ret void
 }
@@ -127,6 +127,30 @@
   ret void
 }
 
+; CHECK-LABEL: flat_scratch_unaligned_load:
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+; CHECK: flat_load_ubyte
+define void @flat_scratch_unaligned_load() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
+; CHECK-LABEL: flat_scratch_unaligned_store:
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+; CHECK: flat_store_byte
+define void @flat_scratch_unaligned_store() {
+  %scratch = alloca i32
+  %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
+  store volatile i32 0, i32 addrspace(4)* %fptr, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind convergent }
 attributes #3 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/scratch-buffer.ll
===================================================================
--- test/CodeGen/AMDGPU/scratch-buffer.ll
+++ test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -108,3 +108,25 @@
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: scratch_unaligned_load:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define void @scratch_unaligned_load() {
+  %scratch = alloca i32
+  %ld = load volatile i32, i32* %scratch, align 1
+  ret void
+}
+
+; GCN-LABEL: scratch_unaligned_store:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+define void @scratch_unaligned_store() {
+  %scratch = alloca i32
+  store volatile i32 0, i32* %scratch, align 1
+  ret void
+}
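A possible follow-up, sketched here rather than taken from the patch: because the new SIISelLowering.cpp check only rejects the access when hasUnalignedScratchAccess() is false, a companion test could pass -mattr=+unaligned-scratch-access and verify whatever the enabled lowering turns out to be. The RUN line and function below are illustrative only, and the CHECK lines are deliberately left open because the final output also depends on the remaining unaligned-access handling in allowsMisalignedMemoryAccesses:

; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefix=UNALIGNED %s
; UNALIGNED-LABEL: scratch_unaligned_load_enabled:
; (CHECK lines to be filled in from an actual llc run)
define void @scratch_unaligned_load_enabled() {
  %scratch = alloca i32
  %ld = load volatile i32, i32* %scratch, align 1
  ret void
}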