diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -733,6 +733,9 @@
     // results by generating some sort of (optimized) target-specific
     // instructions.
 
+    if (ShuffleTy->getNumElements() != NumSubVecElems)
+      return false;
+
     switch (NumSubVecElems) {
     default:
       return false;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1930,3 +1930,22 @@
   store <16 x i64> %r, <16 x i64>* %d, align 8
   ret void
 }
+
+define <2 x i64> @PR37616(<16 x i64>* %a0) {
+; AVX1-LABEL: PR37616:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps 16(%rdi), %xmm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512-LABEL: PR37616:
+; AVX2OR512:       # %bb.0:
+; AVX2OR512-NEXT:    vmovaps (%rdi), %ymm0
+; AVX2OR512-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2OR512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512-NEXT:    vzeroupper
+; AVX2OR512-NEXT:    retq
+  %load = load <16 x i64>, <16 x i64>* %a0, align 128
+  %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
+  ret <2 x i64> %shuffle
+}