Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -816,20 +816,22 @@
   // Check alignment restrictions.
   unsigned Alignment = getAlignment(S0);
 
-  // If the store is going to be misaligned, don't vectorize it.
-  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (S0->getPointerAddressSpace() != 0)
-      return false;
-
+  if (S0->getPointerAddressSpace() == 0 &&
+      Alignment < StackAdjustedAlignment) {
+    // Even if the target supports unaligned access of the type, it still may
+    // be better to adjust the alignment.
     if (AllocaInst *AI = canAdjustAllocaAlignment(S0->getPointerOperand(),
                                                   EltSzInBytes, Alignment)) {
       Alignment = StackAdjustedAlignment;
       if (AI->getAlignment() < Alignment)
         AI->setAlignment(Alignment);
-    } else
-      return false;
+    }
   }
 
+  // If the store is going to be misaligned, don't vectorize it.
+  if (accessIsMisaligned(SzInBytes, AS, Alignment))
+    return false;
+
   BasicBlock::iterator First, Last;
   std::tie(First, Last) = getBoundaryInstrs(Chain);
   Builder.SetInsertPoint(&*Last);
@@ -965,20 +967,19 @@
   // Check alignment restrictions.
   unsigned Alignment = getAlignment(L0);
 
-  // If the load is going to be misaligned, don't vectorize it.
-  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (L0->getPointerAddressSpace() != 0)
-      return false;
-
+  if (L0->getPointerAddressSpace() == 0 && Alignment < StackAdjustedAlignment) {
     if (AllocaInst *AI = canAdjustAllocaAlignment(L0->getPointerOperand(),
                                                   EltSzInBytes, Alignment)) {
       Alignment = StackAdjustedAlignment;
       if (AI->getAlignment() < Alignment)
         AI->setAlignment(Alignment);
-    } else
-      return false;
+    }
   }
 
+  // If the load is going to be misaligned, don't vectorize it.
+  if (accessIsMisaligned(SzInBytes, AS, Alignment))
+    return false;
+
   DEBUG({
     dbgs() << "LSV: Loads to vectorize:\n";
     for (Instruction *I : Chain)
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -21,6 +21,23 @@
   ret void
 }
 
+; ALL-LABEL: @load_alloca16_unknown_offset_align1_i8(
+; ALL: alloca [128 x i8], align 16
+; UNALIGNED: load <2 x i8>, <2 x i8>* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: load i8, i8* %ptr0, align 1{{$}}
+; ALIGNED: load i8, i8* %ptr1, align 1{{$}}
+define void @load_alloca16_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i8], align 16
+  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
+  %val0 = load i8, i8* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i8, i8* %ptr0, i32 1
+  %val1 = load i8, i8* %ptr1, align 1
+  %add = add i8 %val0, %val1
+  store i8 %add, i8 addrspace(1)* %out
+  ret void
+}
+
 ; ALL-LABEL: @load_unknown_offset_align1_i16(
 ; ALL: alloca [128 x i16], align 1{{$}}
 ; UNALIGNED: load <2 x i16>, <2 x i16>* %{{[0-9]+}}, align 1{{$}}
@@ -40,11 +57,8 @@
 
 ; Although the offset is unknown here, we know it is a multiple of the element size.
 ; ALL-LABEL: @load_unknown_offset_align1_i32(
-; UNALIGNED: alloca [128 x i32], align 1
-; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: alloca [128 x i32], align 4
-; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+; ALL: alloca [128 x i32], align 4
+; ALL: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
 define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
@@ -56,13 +70,10 @@
   ret void
 }
 
-; FIXME: Should always increase alignment of the load
 ; Make sure alloca alignment isn't decreased
 ; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
 ; ALL: alloca [128 x i32], align 16
-
-; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+; ALL: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
 define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
@@ -106,11 +117,8 @@
 
 ; Although the offset is unknown here, we know it is a multiple of the element size.
 ; ALL-LABEL: @store_unknown_offset_align1_i32(
-; UNALIGNED: alloca [128 x i32], align 1
-; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: alloca [128 x i32], align 4
-; ALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+; ALL: alloca [128 x i32], align 4
+; ALL: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
 define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
@@ -120,5 +128,17 @@
   store i32 9, i32* %ptr0, align 1
   %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
   store i32 10, i32* %ptr1, align 1
   ret void
 }
 
-attributes #0 = { nounwind }
+; Make sure the alignment of the alloca isn't decreased
+; ALL-LABEL: @store_alloca16_unknown_offset_align1_i32(
+; ALL: alloca [128 x i32], align 16
+; ALL: store <2 x i32> <i32 9, i32 10>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+define void @store_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i32], align 16
+  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
+  store i32 9, i32* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
+  store i32 10, i32* %ptr1, align 1
+  ret void
+}
+attributes #0 = { nounwind }
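
For reviewers, a minimal before/after sketch of the behavior change (illustrative only, not part of the patch; the function name @example is made up). Previously, alloca realignment was only attempted after the chain was already classified as misaligned, and vectorization was abandoned if the adjustment failed; so on subtargets with unaligned access support the alloca was never realigned at all. With this patch, any address-space-0 access below StackAdjustedAlignment gets the alloca adjustment first, and the misalignment check then runs against the improved alignment:

; Input: the alloca only claims align 1, but the unknown offset is a
; multiple of the element size (4 bytes).
define void @example(i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 1
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
  store i32 9, i32* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr0, i32 1
  store i32 10, i32* %ptr1, align 1
  ret void
}

; Expected result on both ALIGNED and UNALIGNED subtargets: the alloca is
; promoted to StackAdjustedAlignment and the chain becomes a single aligned
; vector store, roughly:
;   %alloca = alloca [128 x i32], align 4
;   store <2 x i32> <i32 9, i32 10>, <2 x i32>* %0, align 4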