Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -584,61 +584,54 @@
                                                       unsigned AddrSpace,
                                                       unsigned Align,
                                                       bool *IsFast) const {
+  // If the access is allowed, it is expected to be similarly fast as an aligned
+  // access in all cases except one.
   if (IsFast)
-    *IsFast = false;
+    *IsFast = true;

   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
   // Until MVT is extended to handle this, simply check for the size and
   // rely on the condition below: allow accesses if the size is a multiple of 4.
-  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
-                           VT.getStoreSize() > 16)) {
+  if (VT == MVT::Other)
     return false;
-  }
+
+  // 4-byte alignment is always good in all address spaces.
+  //
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  bool AlignedBy4 = (Align % 4 == 0);
+  if (AlignedBy4)
+    return true;

   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
-    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
-    // with adjacent offsets.
-    bool AlignedBy4 = (Align % 4 == 0);
-    if (IsFast)
-      *IsFast = AlignedBy4;
-
-    return AlignedBy4;
+    // For local/region, ds_read/write_b64 require 8-byte alignment, but we can
+    // do a 4 byte aligned, 8 byte access in a single operation using
+    // ds_read2/write2_b32 with adjacent offsets.
+    return false;
   }

-  // FIXME: We have to be conservative here and assume that flat operations
-  // will access scratch. If we had access to the IR function, then we
-  // could determine if any private memory was used in the function.
   if (!Subtarget->hasUnalignedScratchAccess() &&
       (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
        AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    // FIXME: We have to be conservative here and assume that flat operations
+    // will access scratch. If we had access to the IR function, then we could
+    // determine if any private memory was used in the function.
     return false;
   }

   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
-    if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
-        (Align % 4 == 0) : true;
-    }
-
+    if (IsFast)
+      *IsFast = (AddrSpace != AMDGPUAS::CONSTANT_ADDRESS);
     return true;
   }

   // Smaller than dword value must be aligned.
-  if (VT.bitsLT(MVT::i32))
-    return false;
-
-  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
-  // byte-address are ignored, thus forcing Dword alignment.
-  // This applies to private, global, and constant memory.
-  if (IsFast)
-    *IsFast = true;
-
-  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+  return false;
 }

 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access,+unaligned-scratch-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s

 ; SI-LABEL: {{^}}local_unaligned_load_store_i16:
@@ -601,4 +601,81 @@
   ret void
 }

+; SI-LABEL: {{^}}private_store_align4_v16i64:
+; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; SI-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:12
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:16
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:20
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:24
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:28
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:36
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:40
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:44
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:48
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:52
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:56
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:60
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:64
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:68
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:72
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:76
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:80
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:84
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:88
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:92
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:96
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:100
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:104
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:108
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:112
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:116
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:120
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:124
+define void @private_store_align4_v16i64(<16 x i64>* %out) #0 {
+  store <16 x i64> zeroinitializer, <16 x i64>* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align1_v2i16:
+; UNALIGNED: buffer_store_dword
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+define void @private_store_align1_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align2_v2i16:
+; UNALIGNED: buffer_store_dword
+
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+define void @private_store_align2_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 2
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align4_v2i16:
+; SI: buffer_store_dword
+define void @private_store_align4_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align8_v2i16:
+; SI: buffer_store_dword
+define void @private_store_align8_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 8
+  ret void
+}
+
 attributes #0 = { nounwind }
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -7,15 +7,7 @@
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store <2 x i32>
-
-; ELT16-UNALIGNED: store <4 x i32>
+; ALL-UNALIGNED: store <4 x i32>
 define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2