Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -78,6 +78,13 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   unsigned AddrSpace) const;
+  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                    unsigned Alignment,
+                                    unsigned AddrSpace) const;
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getArithmeticInstrCost(
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -129,6 +129,25 @@
   }
 }
 
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                                unsigned Alignment,
+                                                unsigned AddrSpace) const {
+  // We allow vectorization of flat stores, even though we may need to
+  // decompose them later if they may access private memory. We don't have
+  // enough context here, and legalization can handle it.
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
+  }
+  return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                                 unsigned Alignment,
+                                                 unsigned AddrSpace) const {
+  return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
 unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Semi-arbitrary large amount.
   return 64;
Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -432,9 +432,12 @@
   unsigned ElementSizeBytes = ElementSizeBits / 8;
   unsigned SizeBytes = ElementSizeBytes * Chain.size();
   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
-  if (NumLeft == Chain.size())
-    --NumLeft;
-  else if (NumLeft == 0)
+  if (NumLeft == Chain.size()) {
+    if ((NumLeft & 1) == 0)
+      NumLeft /= 2; // Split evenly in half.
+    else
+      --NumLeft;    // Split off the last element.
+  } else if (NumLeft == 0)
     NumLeft = 1;
   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
 }
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -64,7 +64,10 @@
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+
+; FIXME: Should change alignment
+; ALIGNED: load i32
+; ALIGNED: load i32
 define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -1,8 +1,9 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
@@ -28,6 +29,60 @@
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32 9, i32* %out, align 1
+; ALIGNED: store i32 1, i32* %out.gep.1, align 1
+; ALIGNED: store i32 23, i32* %out.gep.2, align 1
+; ALIGNED: store i32 19, i32* %out.gep.3, align 1
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1
+
+; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  store i32 19, i32* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
+; ALIGNED: store i32 9, i32* %out, align 2
+; ALIGNED: store i32 1, i32* %out.gep.1, align 2
+; ALIGNED: store i32 23, i32* %out.gep.2, align 2
+; ALIGNED: store i32 19, i32* %out.gep.3, align 2
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 2
+  store i32 1, i32* %out.gep.1, align 2
+  store i32 23, i32* %out.gep.2, align 2
+  store i32 19, i32* %out.gep.3, align 2
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
 define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
@@ -42,6 +97,25 @@
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
+define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i32 1
+  %out.gep.2 = getelementptr i8, i8* %out, i32 2
+  %out.gep.3 = getelementptr i8, i8* %out, i32 3
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  store i8 19, i8* %out.gep.3, align 1
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
 define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
@@ -52,4 +126,98 @@
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
+define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 2
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
+define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 1
+  store i16 12, i16* %out.gep.1, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
+; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
+define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 8
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
+; ELT4: store i32
+; ELT4: store i32
+; ELT4: store i32
+
+; ELT8: store <2 x i32>
+; ELT8: store i32
+
+; ELT16: store <3 x i32>
+define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out
+  store i32 1, i32* %out.gep.1
+  store i32 23, i32* %out.gep.2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <3 x i8>
+define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8* %out, i8 2
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
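
Note on the LoadStoreVectorizer hunk: the code below is not part of the patch. It is a minimal standalone sketch of the new chain-splitting arithmetic in Vectorizer::splitOddVectorElts (the helper splitCounts and the main driver are made up for illustration). With four 32-bit stores and max-private-element-size-8, the chain is now split 2+2 instead of 3+1, so both halves can be emitted as <2 x i32> stores, which is what the new ELT8-UNALIGNED checks expect.

// Standalone sketch, not patch code: mirrors the new splitOddVectorElts logic.
#include <cstdio>
#include <utility>

// splitCounts() is a hypothetical helper returning (left, right) element counts.
static std::pair<unsigned, unsigned> splitCounts(unsigned NumElts,
                                                 unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * NumElts;
  // Keep a multiple of 4 bytes in the left half.
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == NumElts) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split evenly in half.
    else
      --NumLeft;    // Split off the last element.
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(NumLeft, NumElts - NumLeft);
}

int main() {
  // Four i32 elements: previously split 3+1, now 2+2 (two <2 x i32> halves).
  std::pair<unsigned, unsigned> S = splitCounts(4, 32);
  std::printf("left=%u right=%u\n", S.first, S.second); // prints left=2 right=2
  return 0;
}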