Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -389,7 +389,8 @@
   bool isFPVectorizationPotentiallyUnsafe() const;
 
   /// \brief Determine if the target supports unaligned memory accesses.
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace = 0,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace = 0,
                                       unsigned Alignment = 1,
                                       bool *Fast = nullptr) const;
 
@@ -668,7 +669,8 @@
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
-  virtual bool allowsMisalignedMemoryAccesses(unsigned BitWidth,
+  virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                              unsigned BitWidth,
                                               unsigned AddressSpace,
                                               unsigned Alignment,
                                               bool *Fast) = 0;
@@ -841,9 +843,10 @@
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace,
                                       unsigned Alignment, bool *Fast) override {
-    return Impl.allowsMisalignedMemoryAccesses(BitWidth, AddressSpace,
+    return Impl.allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
                                                Alignment, Fast);
   }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -253,7 +253,8 @@
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth,
                                       unsigned AddressSpace,
                                       unsigned Alignment,
                                       bool *Fast) { return false; }
Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h
@@ -105,10 +105,11 @@
   /// \name Scalar TTI Implementations
   /// @{
 
-  bool allowsMisalignedMemoryAccesses(unsigned BitWidth, unsigned AddressSpace,
+  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                      unsigned BitWidth, unsigned AddressSpace,
                                       unsigned Alignment, bool *Fast) const {
-    MVT M = MVT::getIntegerVT(BitWidth);
-    return getTLI()->allowsMisalignedMemoryAccesses(M, AddressSpace, Alignment, Fast);
+    EVT E = EVT::getIntegerVT(Context, BitWidth);
+    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment, Fast);
  }
 
  bool hasBranchDivergence() { return false; }
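[Editor's illustration, not part of the patch.] The interface change above threads an LLVMContext through allowsMisalignedMemoryAccesses so that BasicTTIImpl can build an EVT with EVT::getIntegerVT(Context, BitWidth) instead of an MVT, which lets callers query bit widths that have no simple machine value type. A minimal sketch of a hypothetical client of the updated hook; the function name isFastUnalignedAccess and its parameters are invented for this example:

  // Sketch only: ask whether a misaligned access of BitWidth bits is both
  // allowed and fast for the target behind TTI.
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  static bool isFastUnalignedAccess(const TargetTransformInfo &TTI,
                                    LLVMContext &Ctx, unsigned BitWidth,
                                    unsigned AddrSpace, unsigned Alignment) {
    bool Fast = false;
    // Passing the context lets the default implementation form an EVT, so
    // widths such as 96 bits no longer have to map onto a simple MVT.
    bool Allows = TTI.allowsMisalignedMemoryAccesses(Ctx, BitWidth, AddrSpace,
                                                     Alignment, &Fast);
    return Allows && Fast;
  }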
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -186,11 +186,12 @@
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
 
-bool TargetTransformInfo::allowsMisalignedMemoryAccesses(unsigned BitWidth,
+bool TargetTransformInfo::allowsMisalignedMemoryAccesses(LLVMContext &Context,
+                                                         unsigned BitWidth,
                                                          unsigned AddressSpace,
                                                          unsigned Alignment,
                                                          bool *Fast) const
 {
-  return TTIImpl->allowsMisalignedMemoryAccesses(BitWidth, AddressSpace,
+  return TTIImpl->allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
                                                  Alignment, Fast);
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -439,8 +439,12 @@
 
   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
-  if (!VT.isSimple() || VT == MVT::Other)
+  // Until MVT is extended to handle this, simply check for the size and
+  // rely on the condition below: allow accesses if the size is a multiple of 4.
+  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+                           VT.getStoreSize() > 16)) {
     return false;
+  }
 
   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
Index: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -40,9 +40,8 @@
 
 namespace {
 
-// TODO: Remove this
-static const unsigned TargetBaseAlign = 4;
-
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
 typedef SmallVector<Instruction *, 8> InstrList;
 typedef MapVector<Value *, InstrList> InstrListMap;
 
@@ -798,8 +797,8 @@
       // so we can cheat and change it!
       Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
       if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
-        AI->setAlignment(TargetBaseAlign);
-        Alignment = TargetBaseAlign;
+        AI->setAlignment(StackAdjustedAlignment);
+        Alignment = StackAdjustedAlignment;
       } else {
         return false;
       }
@@ -948,8 +947,8 @@
       // so we can cheat and change it!
       Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
       if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
-        AI->setAlignment(TargetBaseAlign);
-        Alignment = TargetBaseAlign;
+        AI->setAlignment(StackAdjustedAlignment);
+        Alignment = StackAdjustedAlignment;
       } else {
         return false;
       }
@@ -1029,10 +1028,13 @@
 
 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                     unsigned Alignment) {
+  if (Alignment % SzInBytes == 0)
+    return false;
   bool Fast = false;
-  bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
+  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+                                                   SzInBytes * 8, AddressSpace,
                                                    Alignment, &Fast);
-  // TODO: Remove TargetBaseAlign
-  return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
-         (Alignment % TargetBaseAlign) != 0;
+  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+               << " and fast? " << Fast << "\n";);
+  return !Allows || !Fast;
 }
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
@@ -22,9 +22,9 @@
 define void @load_fn(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn()
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -35,9 +35,9 @@
 define void @load_fn_nounwind(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind() #0
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -48,9 +48,9 @@
 define void @load_fn_nounwind_writeonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind_writeonly() #1
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -60,9 +60,9 @@
 define void @load_fn_nounwind_readonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_nounwind_readonly() #2
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -73,9 +73,9 @@
 define void @load_fn_readonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_readonly() #4
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -86,9 +86,9 @@
 define void @load_fn_writeonly(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_writeonly() #3
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -98,9 +98,9 @@
 define void @load_fn_readnone(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  %v0 = load i32, i32* %p
+  %v0 = load i32, i32* %p, align 8
   call void @fn_readnone() #5
-  %v1 = load i32, i32* %p.1
+  %v1 = load i32, i32* %p.1, align 4
   ret void
 }
 
@@ -193,9 +193,9 @@
 define void @store_fn_readnone(i32* %p) #0 {
   %p.1 = getelementptr i32, i32* %p, i32 1
 
-  store i32 0, i32* %p
+  store i32 0, i32* %p, align 8
   call void @fn_readnone() #5
-  store i32 0, i32* %p.1
+  store i32 0, i32* %p.1, align 8
   ret void
 }
 
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
@@ -6,7 +6,7 @@
 
 define void @foo() {
   ; CHECK: load <4 x float>
-  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 4
+  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 16
   %b = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 1), align 4
   %c = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 2), align 4
   %d = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 3), align 4
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -17,8 +17,8 @@
 define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0
   %buff.int = load i32, i32* %tmp0, align 8
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
@@ -18,11 +18,11 @@
 define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
   ret void
 }
 
@@ -36,12 +36,12 @@
 entry:
   %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
   ret void
 }
 
@@ -55,8 +55,8 @@
 define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
 entry:
   %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
-  %buff.p = load i8*, i8** %tmp1, align 8
-  %buff.val = load i8, i8* %buff.p, align 8
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
   store i8 0, i8* %buff.p, align 8
   br label %"for something"
 
@@ -64,7 +64,7 @@
   %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
 
   %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
-  %buff.int = load i64, i64* %tmp0, align 8
+  %buff.int = load i64, i64* %tmp0, align 16
 
   %index.next = add i64 %index, 8
   %cmp_res = icmp eq i64 %index.next, 8
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"