Index: lib/Target/X86/X86InterleavedAccess.cpp =================================================================== --- lib/Target/X86/X86InterleavedAccess.cpp +++ lib/Target/X86/X86InterleavedAccess.cpp @@ -103,9 +103,14 @@ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX. uint64_t ExpectedShuffleVecSize; - if (isa(Inst)) + if (isa(Inst)) { + // If the load size is less than Factor * ShuffleVecSize, the transpose will + // not be profitable. + if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize) + return false; + ExpectedShuffleVecSize = 256; - else + } else ExpectedShuffleVecSize = 1024; if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || Index: test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll =================================================================== --- test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll +++ test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll @@ -217,3 +217,14 @@ store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16 ret void } + +; Check that a load smaller than Factor * ShuffleVecSize is not transposed. +@a = local_unnamed_addr global <4 x double> zeroinitializer, align 32 +; Function Attrs: norecurse nounwind readonly uwtable +define <4 x double> @test_unprofitable(<4 x double> %b) { +entry: + %0 = load <4 x double>, <4 x double>* @a, align 32 + %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> + %shuffle = shufflevector <4 x double> %1, <4 x double> %b, <4 x i32> + ret <4 x double> %shuffle +}