Index: lib/Target/NVPTX/NVPTXTargetTransformInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,19 @@
     return AddressSpace::ADDRESS_SPACE_GENERIC;
   }
 
+  // Load and store chains share one rule: a chain is legal to vectorize only
+  // if Alignment >= ChainSizeInBytes. AddrSpace is not consulted.
+  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   unsigned AddrSpace) const {
+    return Alignment >= ChainSizeInBytes;
+  }
+  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                    unsigned Alignment,
+                                    unsigned AddrSpace) const {
+    return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
+  }
+
   // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
   // We conservatively return 1 here which is just enough to enable the
   // vectorizers but disables heuristics based on the number of registers.
Index: test/CodeGen/NVPTX/vectorize-misaligned.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/vectorize-misaligned.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+; %in/%out are only 8-byte aligned, so each 16-byte chain should become 2 x v2.
+; CHECK-LABEL: test1
+; CHECK: ld.global.v2.f32
+; CHECK: ld.global.v2.f32
+; CHECK: st.global.v2.f32
+; CHECK: st.global.v2.f32
+define void @test1(float addrspace(1)* noalias align 8 %in, float addrspace(1)* noalias align 8 %out) {
+  %in.1 = getelementptr float, float addrspace(1)* %in, i32 1
+  %in.2 = getelementptr float, float addrspace(1)* %in, i32 2
+  %in.3 = getelementptr float, float addrspace(1)* %in, i32 3
+  %v0 = load float, float addrspace(1)* %in, align 8
+  %v1 = load float, float addrspace(1)* %in.1, align 4
+  %v2 = load float, float addrspace(1)* %in.2, align 8
+  %v3 = load float, float addrspace(1)* %in.3, align 4
+  %sum0 = fadd float %v0, %v1
+  %sum1 = fadd float %v1, %v2
+  %sum2 = fadd float %v3, %v1
+  %sum3 = fadd float %v2, %v3
+  %out.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  store float %sum0, float addrspace(1)* %out, align 8
+  store float %sum1, float addrspace(1)* %out.1, align 4
+  store float %sum2, float addrspace(1)* %out.2, align 8
+  store float %sum3, float addrspace(1)* %out.3, align 4
+  ret void
+}