diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -150,7 +150,7 @@
   // We support shuffle represents stride 4 for byte type with size of
   // WideInstSize.
   if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
-      return true;
+    return true;
 
   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
       (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
@@ -211,13 +211,19 @@
     VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
   }
   // Generate N loads of T type.
+  assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized());
+  const Align FirstAlignment = LI->getAlign();
+  const Align SubsequentAlignment = commonAlignment(
+      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize());
+  Align Alignment = FirstAlignment;
   for (unsigned i = 0; i < NumLoads; i++) {
     // TODO: Support inbounds GEP.
     Value *NewBasePtr =
         Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
     Instruction *NewLoad =
-        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlign());
+        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
     DecomposedVectors.push_back(NewLoad);
+    Alignment = SubsequentAlignment;
   }
 }
@@ -255,7 +261,7 @@
                             SmallVectorImpl<int> &Out, int LowOffset,
                             int HighOffset) {
   assert(VT.getSizeInBits() >= 256 &&
-        "This function doesn't accept width smaller then 256");
+         "This function doesn't accept width smaller then 256");
   unsigned NumOfElm = VT.getVectorNumElements();
   for (unsigned i = 0; i < Mask.size(); i++)
     Out.push_back(Mask[i] + LowOffset);
@@ -289,7 +295,7 @@
   if (VecElems == 16) {
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] = Builder.CreateShuffleVector(
-         Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
     return;
   }
 
@@ -298,20 +304,19 @@
   for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
     genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
-                   (i + 1) / Stride * 16);
+                    (i + 1) / Stride * 16);
     Temp[i / 2] = Builder.CreateShuffleVector(
-       Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
     OptimizeShuf.clear();
   }
 
   if (VecElems == 32) {
     std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
     return;
-  }
-  else
+  } else
     for (unsigned i = 0; i < Stride; i++)
       TransposedMatrix[i] =
-         Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
 }
 
 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
@@ -682,7 +687,7 @@
   unsigned NumOfElm = VT.getVectorNumElements();
   group2Shuffle(VT, GroupSize, VPShuf);
-  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
+  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
 }
 
 void X86InterleavedAccessGroup::transpose_4x4(
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll
@@ -5,7 +5,7 @@
 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; CHECK-LABEL: @load_factorf64_4(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* %ptr to <4 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
@@ -40,7 +40,7 @@
 define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; CHECK-LABEL: @load_factori64_4(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i64>* %ptr to <4 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i64>* [[PTR:%.*]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x i64>, <4 x i64>* [[TMP1]], i32 1
@@ -75,7 +75,7 @@
 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; CHECK-LABEL: @load_factorf64_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* %ptr to <4 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x double>* [[PTR:%.*]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP1]], i32 1
@@ -228,6 +228,13 @@
 @a = local_unnamed_addr global <4 x double> zeroinitializer, align 32
 ; Function Attrs: norecurse nounwind readonly uwtable
 define <4 x double> @test_unhandled(<4 x double> %b) {
+; CHECK-LABEL: @test_unhandled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* @a, align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> undef, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[B:%.*]], <4 x i32>
+; CHECK-NEXT:    ret <4 x double> [[SHUFFLE]]
+;
 entry:
   %0 = load <4 x double>, <4 x double>* @a, align 32
   %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32>
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedLoad.ll
@@ -6,17 +6,17 @@
 ; AVX2-LABEL: @interleaved_load_vf32_i8_stride3(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <96 x i8>* [[PTR:%.*]] to <16 x i8>*
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 128
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 128
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 128
 ; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 3
-; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 128
 ; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]]
+; AVX2-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP10]], align 128
 ; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 5
-; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]]
+; AVX2-NEXT:    [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[TMP12]], align 128
 ; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP9]], <32 x i32>
 ; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP11]], <32 x i32>
 ; AVX2-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP13]], <32 x i32>
@@ -48,11 +48,11 @@
 ; AVX2-LABEL: @interleaved_load_vf16_i8_stride3(
 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <48 x i8>* [[PTR:%.*]] to <16 x i8>*
 ; AVX2-NEXT:    [[TMP2:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 0
-; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 64
 ; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 64
 ; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[TMP1]], i32 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 64
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> undef, <16 x i32>
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> undef, <16 x i32>
@@ -79,7 +79,7 @@
 define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
 ; AVX2-LABEL: @interleaved_load_vf8_i8_stride3(
-; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]]
+; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, <24 x i8>* [[PTR:%.*]], align 32
 ; AVX2-NEXT:    [[V1:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
 ; AVX2-NEXT:    [[V2:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
 ; AVX2-NEXT:    [[V3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> undef, <8 x i32>
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> [[TMP16]], <64 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <64 x i8> [[TMP17]], <64 x i8> [[TMP18]], <128 x i32>
-; CHECK-NEXT:    store <128 x i8> [[TMP19]], <128 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <128 x i8> [[TMP19]], <128 x i8>* [[P:%.*]], align 128
 ; CHECK-NEXT:    ret void
 ;
   %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32>
@@ -54,7 +54,7 @@
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <32 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <32 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> [[TMP14]], <64 x i32>
-; CHECK-NEXT:    store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <64 x i8> [[TMP15]], <64 x i8>* [[P:%.*]], align 64
 ; CHECK-NEXT:    ret void
 ;
   %v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32>
@@ -77,7 +77,7 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32>
-; CHECK-NEXT:    store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32>
@@ -232,7 +232,7 @@
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <64 x i8> [[TMP23]], <64 x i8> [[TMP24]], <128 x i32>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> [[TMP26]], <128 x i32>
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <128 x i8> [[TMP27]], <128 x i8> [[TMP28]], <256 x i32>
-; CHECK-NEXT:    store <256 x i8> [[TMP29]], <256 x i8>* [[P:%.*]]
+; CHECK-NEXT:    store <256 x i8> [[TMP29]], <256 x i8>* [[P:%.*]], align 256
 ; CHECK-NEXT:    ret void
 ;
   %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32>
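
Note on the new alignment logic in decompose(): only the first split load may keep the original wide load's alignment, because every later load sits at a non-zero byte offset from the same base, so the most it can guarantee is the common alignment of the original value and that offset. The sketch below is a minimal standalone illustration of that rule and is not part of the patch; Align and commonAlignment are the real helpers from llvm/Support/Alignment.h, while the concrete numbers (a 64-byte-aligned wide load split into 16-byte chunks, four chunks) are assumptions chosen only for the example.

// Standalone sketch (not part of the patch) of the subsequent-load alignment
// rule used by X86InterleavedAccessGroup::decompose(). The numbers here are
// assumptions for illustration only.
#include "llvm/Support/Alignment.h"
#include <cstdint>
#include <cstdio>

int main() {
  const llvm::Align FirstAlignment(64); // assumed alignment of the wide load
  const uint64_t ChunkBytes = 16;       // assumed byte size of each split load
  // A load at byte offset N * ChunkBytes can only rely on the alignment that
  // the original pointer and that offset have in common.
  const llvm::Align SubsequentAlignment =
      llvm::commonAlignment(FirstAlignment, ChunkBytes);

  llvm::Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < 4; ++i) {
    std::printf("load %u at offset %3u -> align %u\n", i,
                unsigned(i * ChunkBytes), unsigned(Alignment.value()));
    Alignment = SubsequentAlignment; // only load 0 keeps the full alignment
  }
  return 0;
}

With those assumed values the first load reports align 64 and the rest report align 16, which is why the patch threads a separate SubsequentAlignment through the loop instead of reusing LI->getAlign() for every generated load.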