Index: lib/Target/X86/X86InterleavedAccess.cpp =================================================================== --- lib/Target/X86/X86InterleavedAccess.cpp +++ lib/Target/X86/X86InterleavedAccess.cpp @@ -98,18 +98,25 @@ bool X86InterleavedAccessGroup::isSupported() const { VectorType *ShuffleVecTy = Shuffles[0]->getType(); - uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); + unsigned SupportedNumElem = 4; + unsigned WideInstSize; // Currently, lowering is supported for 4-element vectors of 64 bits on AVX. - uint64_t ExpectedShuffleVecSize; - if (isa(Inst)) - ExpectedShuffleVecSize = 256; - else - ExpectedShuffleVecSize = 1024; + if (ShuffleElemSize != 64) + return false; + + if (isa(Inst)) { + if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize) + return false; + + WideInstSize = DL.getTypeSizeInBits(Inst->getType()); + } else + WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); - if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || - DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) + if (!Subtarget.hasAVX() || Factor != 4 || + WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem)) return false; return true; @@ -137,8 +144,9 @@ for (unsigned i = 0; i < NumSubVectors; ++i) DecomposedVectors.push_back( cast(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Indices[i], - SubVecTy->getVectorNumElements(), 0)))); + Op0, Op1, + createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0)))); return; } @@ -219,8 +227,8 @@ // Lower the interleaved stores: // 1. Decompose the interleaved wide shuffle into individual shuffle // vectors. 
- decompose(Shuffles[0], Factor, - VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); + decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems), + DecomposedVectors); // 2. Transpose the interleaved-vectors into vectors of contiguous // elements. Index: test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll +++ test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll @@ -217,3 +217,20 @@ store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16 ret void } + +; This test verifies that the pass runs on this input without hitting any +; assertions. X86InterleavedAccess could handle this case and generate a +; transposed sequence if the current implementation were extended to create +; dummy vectors of undef. However, it deliberately does not optimize cases +; where the load size is less than Factor * NumberOfElements, because a +; better sequence can easily be generated by CG. + +@a = local_unnamed_addr global <4 x double> zeroinitializer, align 32 +; Function Attrs: norecurse nounwind readonly uwtable +define <4 x double> @test_unhandled(<4 x double> %b) { +entry: + %0 = load <4 x double>, <4 x double>* @a, align 32 + %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> + %shuffle = shufflevector <4 x double> %1, <4 x double> %b, <4 x i32> + ret <4 x double> %shuffle +}