LoadStoreVectorizer: when a chain would be misaligned, split it (non-alloca
address spaces) or raise the alloca alignment, and only then ask TTI whether
the chain is legal to vectorize. The vector load/store is created with the
resulting alignment via CreateAlignedLoad/CreateAlignedStore.

In the load path the previous unconditional "Alignment = NewAlign;" is removed
so the "NewAlign != 0" guard is effective, matching the store path.

Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -954,11 +954,6 @@
   // try again.
   unsigned EltSzInBytes = Sz / 8;
   unsigned SzInBytes = EltSzInBytes * ChainSize;
-  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
-           vectorizeStoreChain(Chains.second, InstructionsProcessed);
-  }
 
   VectorType *VecTy;
   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
@@ -991,14 +986,23 @@
 
   // If the store is going to be misaligned, don't vectorize it.
   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (S0->getPointerAddressSpace() != 0)
-      return false;
+    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+      auto Chains = splitOddVectorElts(Chain, Sz);
+      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+             vectorizeStoreChain(Chains.second, InstructionsProcessed);
+    }
 
     unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                    StackAdjustedAlignment,
                                                    DL, S0, nullptr, &DT);
-    if (NewAlign < StackAdjustedAlignment)
-      return false;
+    if (NewAlign != 0)
+      Alignment = NewAlign;
+  }
+
+  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+    auto Chains = splitOddVectorElts(Chain, Sz);
+    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+           vectorizeStoreChain(Chains.second, InstructionsProcessed);
   }
 
   BasicBlock::iterator First, Last;
@@ -1037,13 +1041,11 @@
     }
   }
 
-  // This cast is safe because Builder.CreateStore() always creates a bona fide
-  // StoreInst.
-  StoreInst *SI = cast<StoreInst>(
-      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
-                                                     VecTy->getPointerTo(AS))));
+  StoreInst *SI = Builder.CreateAlignedStore(
+      Vec,
+      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+      Alignment);
   propagateMetadata(SI, Chain);
-  SI->setAlignment(Alignment);
 
   eraseInstructions(Chain);
   ++NumVectorInstructions;
@@ -1102,12 +1104,6 @@
   // try again.
   unsigned EltSzInBytes = Sz / 8;
   unsigned SzInBytes = EltSzInBytes * ChainSize;
-  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
-           vectorizeLoadChain(Chains.second, InstructionsProcessed);
-  }
-
   VectorType *VecTy;
   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
   if (VecLoadTy)
@@ -1132,18 +1128,25 @@
 
   // If the load is going to be misaligned, don't vectorize it.
   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (L0->getPointerAddressSpace() != 0)
-      return false;
+    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+      auto Chains = splitOddVectorElts(Chain, Sz);
+      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+             vectorizeLoadChain(Chains.second, InstructionsProcessed);
+    }
 
     unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                    StackAdjustedAlignment,
                                                    DL, L0, nullptr, &DT);
-    if (NewAlign < StackAdjustedAlignment)
-      return false;
-
-    Alignment = NewAlign;
+    if (NewAlign != 0)
+      Alignment = NewAlign;
   }
 
+  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+    auto Chains = splitOddVectorElts(Chain, Sz);
+    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+           vectorizeLoadChain(Chains.second, InstructionsProcessed);
+  }
+
   LLVM_DEBUG({
     dbgs() << "LSV: Loads to vectorize:\n";
     for (Instruction *I : Chain)
@@ -1159,11 +1162,8 @@
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-  // This cast is safe because Builder.CreateLoad always creates a bona fide
-  // LoadInst.
-  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
+  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
   propagateMetadata(LI, Chain);
-  LI->setAlignment(Alignment);
 
   if (VecLoadTy) {
     SmallVector<Instruction *, 16> InstrsToErase;
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -64,10 +64,7 @@
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; FIXME: Should change alignment
-; ALIGNED: load i32
-; ALIGNED: load i32
+; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
 define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16, addrspace(5)
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
@@ -128,5 +125,84 @@
   ret void
 }
 
-attributes #0 = { nounwind }
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 1
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  store i8 9, i8 addrspace(5)* %out, align 1
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
 
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  %load0 = load i32, i32 addrspace(5)* %out, align 1
+  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  %load0 = load i8, i8 addrspace(5)* %out, align 1
+  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+attributes #0 = { nounwind }