diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1423,8 +1423,21 @@
         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
         // ds_write2_b32 depending on the alignment. In either case with either
         // alignment there is no faster way of doing this.
+
+        // The numbers returned here and below are not additive, it is a 'speed
+        // rank'. They are just meant to be compared to decide if a certain way
+        // of lowering an operation is faster than another. For that purpose
+        // naturally aligned operation gets its bitsize to indicate that "it
+        // operates with a speed comparable to N-bit wide load". With the full
+        // alignment ds128 is slower than ds96 for example. If underaligned it
+        // is comparable to a speed of a single dword access, which would then
+        // mean 32 < 128 and it is faster to issue a wide load regardless.
+        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
+        // wider load which will not be aligned anymore the latter is slower.
         if (IsFast)
-          *IsFast = 1;
+          *IsFast = (Alignment >= RequiredAlignment) ? 64
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1442,8 +1455,12 @@
         // be equally slow as a single ds_read_b96/ds_write_b96, but there will
         // be more of them, so overall we will pay less penalty issuing a single
         // instruction.
+
+        // See comment on the values above.
         if (IsFast)
-          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+          *IsFast = (Alignment >= RequiredAlignment) ? 96
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1463,8 +1480,12 @@
         // be equally slow as a single ds_read_b128/ds_write_b128, but there
         // will be more of them, so overall we will pay less penalty issuing a
         // single instruction.
+
+        // See comment on the values above.
         if (IsFast)
-          *IsFast= Alignment >= RequiredAlignment || Alignment < Align(4);
+          *IsFast = (Alignment >= RequiredAlignment) ? 128
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1476,8 +1497,11 @@
       break;
     }
 
+    // See comment on the values above.
+    // Note that we have a single-dword or sub-dword here, so if underaligned
+    // it is a slowest possible access, hence the returned value is 0.
     if (IsFast)
-      *IsFast = Alignment >= RequiredAlignment;
+      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
 
     return Alignment >= RequiredAlignment ||
            Subtarget->hasUnalignedDSAccessEnabled();
@@ -1535,22 +1559,8 @@
 bool SITargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *IsFast) const {
-  bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
-                                                  Alignment, Flags, IsFast);
-
-  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
-      (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-       AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
-    // Lie it is fast if +unaligned-access-mode is passed so that DS accesses
-    // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on a
-    // misaligned data which is faster than a pair of ds_read_b*/ds_write_b*
-    // which would be equally misaligned.
-    // This is only used by the common passes, selection always calls the
-    // allowsMisalignedMemoryAccessesImpl version.
-    *IsFast= 1;
-  }
-
-  return Allow;
+  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+                                            Alignment, Flags, IsFast);
 }
 
 EVT SITargetLowering::getOptimalMemOpType(
@@ -8785,7 +8795,7 @@
     auto Flags = Load->getMemOperand()->getFlags();
     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                            Load->getAlign(), Flags, &Fast) &&
-        Fast)
+        Fast > 1)
       return SDValue();
 
     if (MemVT.isVector())
@@ -9284,7 +9294,7 @@
     auto Flags = Store->getMemOperand()->getFlags();
     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
                                            Store->getAlign(), Flags, &Fast) &&
-        Fast)
+        Fast > 1)
       return SDValue();
 
     if (VT.isVector())
diff --git a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefix=GCN %s
+
+; Check that the vectorizer does not create slow misaligned loads
+
+; GCN-LABEL: {{^}}ds1align1:
+; GCN-COUNT-2: ds_read_u8
+; GCN-COUNT-2: ds_write_b8
+define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+  %val1 = load i8, i8 addrspace(3)* %in, align 1
+  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+  %val2 = load i8, i8 addrspace(3)* %gep1, align 1
+  store i8 %val1, i8 addrspace(3)* %out, align 1
+  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+  store i8 %val2, i8 addrspace(3)* %gep2, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds2align2:
+; GCN-COUNT-2: ds_read_u16
+; GCN-COUNT-2: ds_write_b16
+define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+  %val1 = load i16, i16 addrspace(3)* %in, align 2
+  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+  %val2 = load i16, i16 addrspace(3)* %gep1, align 2
+  store i16 %val1, i16 addrspace(3)* %out, align 2
+  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+  store i16 %val2, i16 addrspace(3)* %gep2, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds4align4:
+; GCN: ds_read2_b32
+; GCN: ds_write2_b32
+define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+  %val1 = load i32, i32 addrspace(3)* %in, align 4
+  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %val2 = load i32, i32 addrspace(3)* %gep1, align 4
+  store i32 %val1, i32 addrspace(3)* %out, align 4
+  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  store i32 %val2, i32 addrspace(3)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds8align8:
+; GCN: ds_read2_b64
+; GCN: ds_write2_b64
+define amdgpu_kernel void @ds8align8(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+  %val1 = load i64, i64 addrspace(3)* %in, align 8
+  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+  %val2 = load i64, i64 addrspace(3)* %gep1, align 8
+  store i64 %val1, i64 addrspace(3)* %out, align 8
+  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+  store i64 %val2, i64 addrspace(3)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds1align2:
+; GCN: ds_read_u16
+; GCN: ds_write_b16
+define amdgpu_kernel void @ds1align2(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+  %val1 = load i8, i8 addrspace(3)* %in, align 2
+  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+  %val2 = load i8, i8 addrspace(3)* %gep1, align 2
+  store i8 %val1, i8 addrspace(3)* %out, align 2
+  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+  store i8 %val2, i8 addrspace(3)* %gep2, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds2align4:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+define amdgpu_kernel void @ds2align4(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+  %val1 = load i16, i16 addrspace(3)* %in, align 4
+  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+  %val2 = load i16, i16 addrspace(3)* %gep1, align 4
+  store i16 %val1, i16 addrspace(3)* %out, align 4
+  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+  store i16 %val2, i16 addrspace(3)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds4align8:
+; GCN: ds_read_b64
+; GCN: ds_write_b64
+define amdgpu_kernel void @ds4align8(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+  %val1 = load i32, i32 addrspace(3)* %in, align 8
+  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %val2 = load i32, i32 addrspace(3)* %gep1, align 8
+  store i32 %val1, i32 addrspace(3)* %out, align 8
+  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  store i32 %val2, i32 addrspace(3)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds8align16:
+; GCN: ds_read_b128
+; GCN: ds_write_b128
+define amdgpu_kernel void @ds8align16(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+  %val1 = load i64, i64 addrspace(3)* %in, align 16
+  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+  %val2 = load i64, i64 addrspace(3)* %gep1, align 16
+  store i64 %val1, i64 addrspace(3)* %out, align 16
+  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+  store i64 %val2, i64 addrspace(3)* %gep2, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -312,9 +312,10 @@
 
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11: scratch_load_d16_hi_b16 [[LDRESULT:v[0-9]+]], off, off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: ds_store_b32 v{{[0-9]+}}, [[LDRESULT]] offset:8
+; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
+; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
+; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
 ; GFX11-NEXT: s_endpgm
 define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
 entry:
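
Note (not part of the patch): a minimal standalone C++ sketch of the 'speed rank' convention described in the comments above and of the `Fast > 1` checks added in the load/store lowering. The names speedRank and keepWideAccess, as well as the example sizes and alignments, are hypothetical and chosen only for illustration; this is not LLVM API code.

#include <cstdio>

// Rank of a Size-bit LDS access with the given byte alignment, mirroring the
// pattern used in allowsMisalignedMemoryAccessesImpl above: a sufficiently
// aligned access is rated at its bit width, an access aligned to less than a
// dword is rated like a single dword access (32), and anything in between is
// rated 1, i.e. "slow, don't do it".
static unsigned speedRank(unsigned SizeInBits, unsigned AlignBytes,
                          unsigned RequiredAlignBytes) {
  if (AlignBytes >= RequiredAlignBytes)
    return SizeInBits;
  if (AlignBytes < 4)
    return 32;
  return 1;
}

// Mirrors the `Fast > 1` checks added in the load/store lowering: keep the
// wide access only if it is rated strictly better than "slow".
static bool keepWideAccess(unsigned Rank) { return Rank > 1; }

int main() {
  // Example values only: a 128-bit access that wants 16-byte alignment.
  printf("b128 @ align 16 -> rank %u, keep=%d\n", speedRank(128, 16, 16),
         keepWideAccess(speedRank(128, 16, 16))); // rank 128, kept
  printf("b128 @ align 8  -> rank %u, keep=%d\n", speedRank(128, 8, 16),
         keepWideAccess(speedRank(128, 8, 16)));  // rank 1, split instead
  printf("b128 @ align 2  -> rank %u, keep=%d\n", speedRank(128, 2, 16),
         keepWideAccess(speedRank(128, 2, 16)));  // rank 32, wide load kept
  return 0;
}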