Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -945,14 +945,14 @@
   bool isLegalToVectorizeStore(StoreInst *SI) const;
 
   /// \returns True if it is legal to vectorize the given load chain.
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                    unsigned Alignment,
-                                   unsigned AddrSpace) const;
+                                   const DataLayout &DL) const;
 
   /// \returns True if it is legal to vectorize the given store chain.
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const;
+                                    const DataLayout &DL) const;
 
   /// \returns The new vector factor value if the target doesn't support \p
   /// SizeInBytes loads or has a better vector factor.
@@ -1166,12 +1166,14 @@
   virtual unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const = 0;
   virtual bool isLegalToVectorizeLoad(LoadInst *LI) const = 0;
   virtual bool isLegalToVectorizeStore(StoreInst *SI) const = 0;
-  virtual bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+  virtual bool isLegalToVectorizeLoadChain(Value *BasePtr,
+                                           unsigned ChainSizeInBytes,
                                            unsigned Alignment,
-                                           unsigned AddrSpace) const = 0;
-  virtual bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                           const DataLayout &DL) const = 0;
+  virtual bool isLegalToVectorizeStoreChain(Value *BasePtr,
+                                            unsigned ChainSizeInBytes,
                                             unsigned Alignment,
-                                            unsigned AddrSpace) const = 0;
+                                            const DataLayout &DL) const = 0;
   virtual unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                        unsigned ChainSizeInBytes,
                                        VectorType *VecTy) const = 0;
@@ -1549,17 +1551,17 @@
   bool isLegalToVectorizeStore(StoreInst *SI) const override {
     return Impl.isLegalToVectorizeStore(SI);
   }
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                    unsigned Alignment,
-                                   unsigned AddrSpace) const override {
-    return Impl.isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
-                                            AddrSpace);
+                                   const DataLayout &DL) const override {
+    return Impl.isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes,
+                                            Alignment, DL);
   }
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const override {
-    return Impl.isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
-                                             AddrSpace);
+                                    const DataLayout &DL) const override {
+    return Impl.isLegalToVectorizeStoreChain(BasePtr, ChainSizeInBytes,
+                                             Alignment, DL);
   }
   unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                unsigned ChainSizeInBytes,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -538,15 +538,15 @@
 
   bool isLegalToVectorizeStore(StoreInst *SI) const { return true; }
 
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                    unsigned Alignment,
-                                   unsigned AddrSpace) const {
+                                   const DataLayout &DL) const {
     return true;
   }
 
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const {
+                                    const DataLayout &DL) const {
     return true;
   }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -635,15 +635,17 @@
 }
 
 bool TargetTransformInfo::isLegalToVectorizeLoadChain(
-    unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {
-  return TTIImpl->isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
-                                              AddrSpace);
+    Value *BasePtr, unsigned ChainSizeInBytes, unsigned Alignment,
+    const DataLayout &DL) const {
+  return TTIImpl->isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes,
+                                              Alignment, DL);
 }
 
 bool TargetTransformInfo::isLegalToVectorizeStoreChain(
-    unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {
-  return TTIImpl->isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
-                                               AddrSpace);
+    Value *BasePtr, unsigned ChainSizeInBytes, unsigned Alignment,
+    const DataLayout &DL) const {
+  return TTIImpl->isLegalToVectorizeStoreChain(BasePtr, ChainSizeInBytes,
+                                               Alignment, DL);
}
 
 unsigned TargetTransformInfo::getLoadVectorFactor(unsigned VF,
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -145,15 +145,15 @@
                                VectorType *VecTy) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
 
-  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeMemChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                   unsigned Alignment,
-                                  unsigned AddrSpace) const;
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                  const DataLayout &DL) const;
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                    unsigned Alignment,
-                                   unsigned AddrSpace) const;
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                   const DataLayout &DL) const;
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const;
+                                    const DataLayout &DL) const;
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
@@ -227,14 +227,15 @@
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
-  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
-                                  unsigned AddrSpace) const;
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
-                                   unsigned Alignment,
-                                   unsigned AddrSpace) const;
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeMemChain(Value *BasePtr, unsigned ChainSizeInBytes,
+                                  unsigned Alignment,
+                                  const DataLayout &DL) const;
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   const DataLayout &DL) const;
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const;
+                                    const DataLayout &DL) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   unsigned getCFInstrCost(unsigned Opcode);
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
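
Note: the interface change above replaces the bare address-space argument with the chain's base pointer and the module DataLayout. Targets can still recover the address space from the pointer type, and can now also reason about the pointer's value. A minimal sketch of what an override looks like under the new signature; MyTTIImpl and its address-space policy are hypothetical and not part of this patch (the real AMDGPU implementation follows below):

  bool MyTTIImpl::isLegalToVectorizeLoadChain(Value *BasePtr,
                                              unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              const DataLayout &DL) const {
    // The address space that used to be passed in directly is now derived
    // from the base pointer's type.
    unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
    // Placeholder policy for illustration only: require chains in address
    // space 3 to stay within their alignment; allow everything else.
    if (AS == 3)
      return Alignment >= ChainSizeInBytes;
    return true;
  }
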
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -269,29 +269,49 @@
   llvm_unreachable("unhandled address space");
 }
 
-bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
-                                            unsigned Alignment,
-                                            unsigned AddrSpace) const {
+bool GCNTTIImpl::isLegalToVectorizeMemChain(Value *BasePtr,
+                                            unsigned ChainSizeInBytes,
+                                            unsigned Alignment,
+                                            const DataLayout &DL) const {
+  auto Ty = cast<PointerType>(BasePtr->getType());
+  unsigned AS = Ty->getAddressSpace();
+
   // We allow vectorization of flat stores, even though we may need to decompose
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
-  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
       ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
+
+  // SI has a hardware bug in the LDS / GDS bounds checking: if the base address
+  // is negative, then the instruction is incorrectly treated as out-of-bounds
+  // even if base + offset is in bounds. This affects the high words of a
+  // vectorized load / store.
+  if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+      (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)) {
+    if (ChainSizeInBytes <= Alignment)
+      return true;
+
+    KnownBits Bits = computeKnownBits(BasePtr, DL);
+    return Bits.isNonNegative();
+  }
+
   return true;
 }
 
-bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
-                                             unsigned Alignment,
-                                             unsigned AddrSpace) const {
-  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+bool GCNTTIImpl::isLegalToVectorizeLoadChain(Value *BasePtr,
+                                             unsigned ChainSizeInBytes,
+                                             unsigned Alignment,
+                                             const DataLayout &DL) const {
+  return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
 }
 
-bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
-                                              unsigned Alignment,
-                                              unsigned AddrSpace) const {
-  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+bool GCNTTIImpl::isLegalToVectorizeStoreChain(Value *BasePtr,
+                                              unsigned ChainSizeInBytes,
+                                              unsigned Alignment,
+                                              const DataLayout &DL) const {
+  return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
 }
 
 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
@@ -658,25 +678,29 @@
   llvm_unreachable("unhandled address space");
 }
 
-bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+bool R600TTIImpl::isLegalToVectorizeMemChain(Value *BasePtr,
+                                             unsigned ChainSizeInBytes,
                                              unsigned Alignment,
-                                             unsigned AddrSpace) const {
+                                             const DataLayout &DL) const {
   // We allow vectorization of flat stores, even though we may need to decompose
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
-  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
+  unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+  return AS != AMDGPUAS::PRIVATE_ADDRESS;
 }
 
-bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+bool R600TTIImpl::isLegalToVectorizeLoadChain(Value *BasePtr,
+                                              unsigned ChainSizeInBytes,
                                               unsigned Alignment,
-                                              unsigned AddrSpace) const {
-  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+                                              const DataLayout &DL) const {
+  return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
 }
 
-bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+bool R600TTIImpl::isLegalToVectorizeStoreChain(Value *BasePtr,
+                                               unsigned ChainSizeInBytes,
                                                unsigned Alignment,
-                                               unsigned AddrSpace) const {
-  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+                                               const DataLayout &DL) const {
+  return isLegalToVectorizeMemChain(BasePtr, ChainSizeInBytes, Alignment, DL);
 }
 
 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
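
Note: the new SOUTHERN_ISLANDS path above is the substance of the fix. A self-contained sketch of the same rule, assuming the usual ValueTracking and KnownBits headers are available; the helper name isChainSafeOnSIDS is hypothetical, not a function in this patch:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/Support/KnownBits.h"

  static bool isChainSafeOnSIDS(llvm::Value *BasePtr, unsigned ChainSizeInBytes,
                                unsigned Alignment, const llvm::DataLayout &DL) {
    // A chain that fits inside its own alignment cannot straddle the sign
    // boundary of the 32-bit DS address: either every element is in bounds,
    // or the scalar accesses would have had negative addresses as well, so
    // vectorizing does not change (buggy) behavior.
    if (ChainSizeInBytes <= Alignment)
      return true;
    // Otherwise, vectorize only when the base address is provably
    // non-negative, e.g. because the pointer is derived from a masked index.
    llvm::KnownBits Bits = llvm::computeKnownBits(BasePtr, DL);
    return Bits.isNonNegative();
  }
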
Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -51,15 +51,16 @@
 
   // Loads and stores can be vectorized if the alignment is at least as big as
   // the load/store we want to vectorize.
-  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeLoadChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                    unsigned Alignment,
-                                   unsigned AddrSpace) const {
+                                   const DataLayout &DL) const {
     return Alignment >= ChainSizeInBytes;
   }
-  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+  bool isLegalToVectorizeStoreChain(Value *BasePtr, unsigned ChainSizeInBytes,
                                     unsigned Alignment,
-                                    unsigned AddrSpace) const {
-    return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
+                                    const DataLayout &DL) const {
+    return isLegalToVectorizeLoadChain(BasePtr, ChainSizeInBytes, Alignment,
+                                       DL);
   }
 
   // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -999,7 +999,8 @@
     Alignment = NewAlign;
   }
 
-  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+  if (!TTI.isLegalToVectorizeStoreChain(S0->getPointerOperand(), SzInBytes,
+                                        Alignment, DL)) {
     auto Chains = splitOddVectorElts(Chain, Sz);
     return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
            vectorizeStoreChain(Chains.second, InstructionsProcessed);
@@ -1143,7 +1144,8 @@
     Alignment = NewAlign;
   }
 
-  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+  if (!TTI.isLegalToVectorizeLoadChain(L0->getPointerOperand(), SzInBytes,
+                                       Alignment, DL)) {
     auto Chains = splitOddVectorElts(Chain, Sz);
     return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/ds-bounds.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/ds-bounds.ll
@@ -0,0 +1,72 @@
+; RUN: opt -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ALL,SI %s
+; RUN: opt -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ALL,NONSI %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32:32:32:16-p3:32:32:32:32:16-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+@compute_lds = external addrspace(3) global [512 x i32], align 16
+
+; ALL-LABEL: @store_aligned(
+; ALL: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 8
+define amdgpu_cs void @store_aligned(i32 addrspace(3)* %ptr) #0 {
+entry:
+  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr, align 8
+  store i32 43, i32 addrspace(3)* %ptr.gep.1
+  ret void
+}
+
+
+; ALL-LABEL: @store_global_const_idx(
+;
+; TODO: Addresses are known-positive, this could be merged!
+; SI: store i32
+; SI: store i32
+;
+; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
+define amdgpu_cs void @store_global_const_idx() #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; ALL-LABEL: @store_global_var_idx_case1(
+; SI: store i32
+; SI: store i32
+; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
+define amdgpu_cs void @store_global_var_idx_case1(i32 %idx) #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; ALL-LABEL: @store_global_var_idx_case2(
+;
+; TODO: Addresses are known-positive, this could be merged!
+; SI: store i32
+; SI: store i32
+;
+; NONSI: store <2 x i32> <i32 42, i32 43>, <2 x i32> addrspace(3)* %0, align 4
+define amdgpu_cs void @store_global_var_idx_case2(i32 %idx) #0 {
+entry:
+  %idx.and = and i32 %idx, 255
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+attributes #0 = { nounwind }
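
Note on the TODO markers in ds-bounds.ll above: computeKnownBits is queried on the pointer itself, and the address of @compute_lds is only assigned during code generation, so even a masked (known-small) index does not make the final DS address provably non-negative. A hypothetical standalone demo, not part of the patch, showing what the analysis can prove about the masked index in isolation:

  #include "llvm/AsmParser/Parser.h"
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/KnownBits.h"
  #include "llvm/Support/SourceMgr.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::LLVMContext Ctx;
    llvm::SMDiagnostic Err;
    // The masked index from @store_global_var_idx_case2, in isolation.
    std::unique_ptr<llvm::Module> M = llvm::parseAssemblyString(
        "define i32 @f(i32 %idx) {\n"
        "  %masked = and i32 %idx, 255\n"
        "  ret i32 %masked\n"
        "}\n",
        Err, Ctx);
    if (!M) {
      Err.print("demo", llvm::errs());
      return 1;
    }
    llvm::Instruction &AndInst = M->getFunction("f")->getEntryBlock().front();
    llvm::KnownBits Bits = llvm::computeKnownBits(&AndInst, M->getDataLayout());
    // Bits 8..31 are known zero, so the *index* is known non-negative; the
    // *address* of @compute_lds plus that index is not, which is why the SI
    // run lines still expect split scalar stores.
    llvm::outs() << (Bits.isNonNegative() ? "non-negative\n" : "unknown\n");
    return 0;
  }
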
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=verde -load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,SI %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,NONSI %s
 
 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -483,7 +484,9 @@
 }
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i32
-; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
+; SI: store i32
+; SI: store i32
+; NONSI: store <2 x i32> , <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
@@ -504,7 +507,11 @@
 }
 
 ; CHECK-LABEL: @merge_local_store_4_constants_i32
-; CHECK: store <4 x i32> , <4 x i32> addrspace(3)*
+; SI: store i32
+; SI: store i32
+; SI: store i32
+; SI: store i32
+; NONSI: store <4 x i32> , <4 x i32> addrspace(3)*
 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=bonaire -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" Index: test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"