diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -724,30 +724,34 @@
   auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
 
   if (isa<LoadInst>(Inst)) {
-    // Try to generate target-sized register(/instruction).
-    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
-
     auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
     unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
-    // Perform matrix-transposition in order to compute interleaved
-    // results by generating some sort of (optimized) target-specific
-    // instructions.
-
     switch (NumSubVecElems) {
     default:
       return false;
     case 4:
-      transpose_4x4(DecomposedVectors, TransposedVectors);
-      break;
     case 8:
     case 16:
     case 32:
     case 64:
-      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
-                              NumSubVecElems);
+      if (ShuffleTy->getNumElements() != NumSubVecElems)
+        return false;
       break;
     }
 
+    // Try to generate target-sized register(/instruction).
+    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+    // Perform matrix-transposition in order to compute interleaved
+    // results by generating some sort of (optimized) target-specific
+    // instructions.
+
+    if (NumSubVecElems == 4)
+      transpose_4x4(DecomposedVectors, TransposedVectors);
+    else
+      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+                              NumSubVecElems);
+
     // Now replace the unoptimized-interleaved-vectors with the
     // transposed-interleaved vectors.
     for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1930,3 +1930,22 @@
   store <16 x i64> %r, <16 x i64>* %d, align 8
   ret void
 }
+
+define <2 x i64> @PR37616(<16 x i64>* %a0) {
+; AVX1-LABEL: PR37616:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps 16(%rdi), %xmm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512-LABEL: PR37616:
+; AVX2OR512:       # %bb.0:
+; AVX2OR512-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2OR512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2OR512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512-NEXT:    vzeroupper
+; AVX2OR512-NEXT:    retq
+  %load = load <16 x i64>, <16 x i64>* %a0, align 128
+  %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
+  ret <2 x i64> %shuffle
+}