diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -150,7 +150,7 @@
   // We support shuffle represents stride 4 for byte type with size of
   // WideInstSize.
   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
-    return true;
+    return true;
 
   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
@@ -211,13 +211,20 @@
     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
   }
   // Generate N loads of T type.
+  assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
+         "VecBaseTy's size must be a multiple of 8");
+  const Align FirstAlignment = LI->getAlign();
+  const Align SubsequentAlignment = commonAlignment(
+      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
+  Align Alignment = FirstAlignment;
   for (unsigned i = 0; i < NumLoads; i++) {
     // TODO: Support inbounds GEP.
     Value *NewBasePtr =
         Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
     Instruction *NewLoad =
-        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlign());
+        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
     DecomposedVectors.push_back(NewLoad);
+    Alignment = SubsequentAlignment;
   }
 }
 
@@ -255,7 +262,7 @@
                             SmallVectorImpl<int> &Out, int LowOffset,
                             int HighOffset) {
   assert(VT.getSizeInBits() >= 256 &&
-         "This function doesn't accept width smaller then 256");
+         "This function doesn't accept width smaller then 256");
   unsigned NumOfElm = VT.getVectorNumElements();
   for (unsigned i = 0; i < Mask.size(); i++)
     Out.push_back(Mask[i] + LowOffset);
@@ -289,7 +296,7 @@
   if (VecElems == 16) {
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] = Builder.CreateShuffleVector(
-          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
     return;
   }
 
@@ -298,20 +305,19 @@
 
   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
-                    (i + 1) / Stride * 16);
+                    (i + 1) / Stride * 16);
     Temp[i / 2] = Builder.CreateShuffleVector(
-        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
     OptimizeShuf.clear();
   }
 
   if (VecElems == 32) {
     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
     return;
-  }
-  else
+  } else
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] =
-          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
 }
 
 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
@@ -682,7 +688,7 @@
 
   unsigned NumOfElm = VT.getVectorNumElements();
   group2Shuffle(VT, GroupSize, VPShuf);
-  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
+  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
 }
 
 void X86InterleavedAccessGroup::transpose_4x4(
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -8,15 +8,15 @@
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 128
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 128
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
 ; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
-; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 128
+; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 16
 ; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 128
+; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 16
 ; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
-; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 128
+; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 16
 ; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32>
 ; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32>
 ; AVX2-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32>
@@ -50,9 +50,9 @@
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 64
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 64
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 16
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32>
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32>
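
For reference, a minimal standalone sketch (not part of the patch) of the alignment arithmetic the new decompose() code relies on: only the first sub-vector load sits at the original pointer, so each later <16 x i8> load, offset by a multiple of 16 bytes, can only be assumed aligned to commonAlignment(FirstAlignment, 16). It assumes only llvm/Support/Alignment.h, which provides the commonAlignment() used in the patch.

    // align_sketch.cpp -- verifies the alignment the patched decompose() assigns
    // to loads after the first one, for the two wide-load alignments exercised
    // by interleavedLoad.ll (align 128 and align 64).
    #include "llvm/Support/Alignment.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      // Sub-vector <16 x i8> is 16 bytes; subsequent loads step by that amount.
      const uint64_t SubVecBytes = 16;
      assert(llvm::commonAlignment(llvm::Align(128), SubVecBytes).value() == 16);
      assert(llvm::commonAlignment(llvm::Align(64), SubVecBytes).value() == 16);
      return 0;
    }

This matches the updated CHECK lines above: whether the wide load is align 128 or align 64, every decomposed load after the first is only guaranteed align 16.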