diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5145,17 +5145,19 @@
 //  TLI:     Target lowering used to determine legal types.
 //  Width:   Width left need to load/store.
 //  WidenVT: The widen vector type to load to/store from
-//  Align:   If 0, don't allow use of a wider type
-//  WidenEx: If Align is not 0, the amount additional we can load/store from.
+//  NumDereferenceableBytes: If 0, don't allow use of a wider type
+//  WidenEx: If NumDereferenceableBytes is not 0,
+//           the additional amount we can load/store.
-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI,
                        unsigned Width, EVT WidenVT,
-                       unsigned Align = 0, unsigned WidenEx = 0) {
+                       unsigned NumDereferenceableBytes = 0,
+                       unsigned WidenEx = 0) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   const bool Scalable = WidenVT.isScalableVector();
   unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
-  unsigned AlignInBits = Align*8;
+  unsigned NumDereferenceableBits = NumDereferenceableBytes * 8;
 
   // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
@@ -5178,8 +5180,9 @@
          Action == TargetLowering::TypePromoteInteger) &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (MemVTWidth == WidenWidth)
         return MemVT;
       RetVT = MemVT;
@@ -5203,8 +5206,9 @@
        WidenEltVT == MemVT.getVectorElementType() &&
        (WidenWidth % MemVTWidth) == 0 &&
        isPowerOf2_32(WidenWidth / MemVTWidth) &&
-       (MemVTWidth <= Width ||
-        (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+       (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                MemVTWidth <= NumDereferenceableBits &&
+                                MemVTWidth <= Width + WidenEx))) {
      if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
        return MemVT;
    }
@@ -5270,13 +5274,24 @@
   TypeSize LdWidth = LdVT.getSizeInBits();
   TypeSize WidenWidth = WidenVT.getSizeInBits();
   TypeSize WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads if they are sufficiently aligned to avoid memory faults
-  // and if the original load is simple.
-  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+  unsigned NumDereferenceableBytes = 0;
+  // Allow wider loads if the original load is simple and we can dereference
+  // padding bytes.
+  if (LD->isSimple()) {
+    NumDereferenceableBytes = LD->getAlignment();
+    if (!LdWidth.isScalable())
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, LdWidth / 8);
+    if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 &&
+        LD->getPointerInfo().isDereferenceable(
+            WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout()))
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, WidenWidth / 8);
+  }
 
   // Find the vector type that can load from.
- EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign, - WidthDiff.getKnownMinSize()); + EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, + NumDereferenceableBytes, WidthDiff.getKnownMinSize()); TypeSize NewVTWidth = NewVT.getSizeInBits(); SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(), LD->getOriginalAlign(), MMOFlags, AAInfo); @@ -5312,13 +5327,21 @@ MachinePointerInfo MPI = LD->getPointerInfo(); do { LdWidth -= NewVTWidth; + if (!NewVTWidth.isScalable()) { + if (NumDereferenceableBytes > NewVTWidth / 8) + NumDereferenceableBytes -= NewVTWidth / 8; + else + NumDereferenceableBytes = 0; + } else + NumDereferenceableBytes = 0; // FIXME + IncrementPointer(cast(LdOp), NewVT, MPI, BasePtr, &ScaledOffset); if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) { // The current type we are using is too large. Find a better size. - NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign, - WidthDiff.getKnownMinSize()); + NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, + NumDereferenceableBytes, WidthDiff.getKnownMinSize()); NewVTWidth = NewVT.getSizeInBits(); } diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -383,7 +383,7 @@ ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 @@ -454,12 +454,12 @@ ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 -; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 -; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 +; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { entry: store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 @@ -479,12 +479,12 @@ ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 -; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 -; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 +; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx16 
s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { entry: store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -470,12 +470,19 @@ define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) { ; CHECK-LABEL: vld3_v2i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrd r2, r0, [r0] -; CHECK-NEXT: strd r2, r0, [sp] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: add r4, sp, #8 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: str r0, [sp, #8] +; CHECK-NEXT: vldrh.u32 q0, [r4] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vstrh.32 q0, [r3] +; CHECK-NEXT: vldrb.u16 q0, [r3] ; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: add r0, r2 @@ -488,8 +495,8 @@ ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strb r0, [r1] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <6 x i8>, <6 x i8>* %src, align 4 %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll --- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll +++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll @@ -130,14 +130,8 @@ define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) { ; SSE2-LABEL: dot3_float3: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: movups (%rdi), %xmm0 +; SSE2-NEXT: movups (%rsi), %xmm1 ; SSE2-NEXT: mulps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] @@ -148,14 +142,8 @@ ; ; SSSE3-LABEL: dot3_float3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSSE3-NEXT: movups (%rdi), %xmm0 +; SSSE3-NEXT: movups (%rsi), %xmm1 ; SSSE3-NEXT: mulps %xmm0, %xmm1 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSSE3-NEXT: addss %xmm1, %xmm0 @@ -165,10 +153,8 @@ ; ; SSE41-LABEL: dot3_float3: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; 
SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; SSE41-NEXT: movups (%rdi), %xmm0 +; SSE41-NEXT: movups (%rsi), %xmm1 ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 @@ -178,11 +164,8 @@ ; ; AVX-LABEL: dot3_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -11,15 +11,14 @@ ; CHECK-LABEL: load23: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups 64(%rsi), %ymm0 -; CHECK-NEXT: vmovups (%rsi), %zmm1 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss %xmm3, 88(%rdi) -; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) +; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovaps 64(%rsi), %xmm1 +; CHECK-NEXT: movq 80(%rsi), %rcx +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm2, 88(%rdi) +; CHECK-NEXT: movq %rcx, 80(%rdi) +; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, <23 x float>* %p, align 16 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -64,26 +64,26 @@ define void @vf4(<20 x i16>* %in.vec, <4 x i16>* %out.vec0, <4 x i16>* %out.vec1, <4 x i16>* %out.vec2, <4 x i16>* %out.vec3, <4 x i16>* %out.vec4) nounwind { ; AVX2-SLOW-LABEL: vf4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovq %xmm3, (%rsi) ; AVX2-SLOW-NEXT: vmovq %xmm4, (%rdx) ; AVX2-SLOW-NEXT: vmovq %xmm5, (%rcx) @@ -93,24 +93,24 @@ ; ; AVX2-FAST-LABEL: vf4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX2-FAST-NEXT: vmovq %xmm4, (%rdx) ; AVX2-FAST-NEXT: vmovq %xmm5, (%rcx) @@ -137,44 +137,44 @@ define void @vf8(<40 x i16>* %in.vec, <8 x i16>* %out.vec0, <8 x i16>* %out.vec1, <8 x i16>* %out.vec2, <8 x i16>* %out.vec3, <8 x i16>* %out.vec4) nounwind { ; AVX2-SLOW-LABEL: vf8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] -; AVX2-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpsllq $48, %xmm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] +; AVX2-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] +; AVX2-SLOW-NEXT: vpsllq $48, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,0] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r9) @@ -183,41 +183,41 @@ ; ; AVX2-FAST-LABEL: vf8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] -; AVX2-FAST-NEXT: vpbroadcastw 70(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpsllq $48, %xmm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] +; AVX2-FAST-NEXT: vpbroadcastw 70(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] +; AVX2-FAST-NEXT: vpsllq $48, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%r8) ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9) 
diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -77,26 +77,34 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqa (%edx), %xmm0 -; X86-NEXT: movdqa 16(%edx), %xmm1 -; X86-NEXT: paddd (%ecx), %xmm0 -; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: movdqa %xmm0, (%eax) +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%edx), %xmm0 +; X86-NEXT: pinsrd $2, 24(%edx), %xmm0 +; X86-NEXT: movdqa (%edx), %xmm1 +; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%ecx), %xmm2 +; X86-NEXT: pinsrd $2, 24(%ecx), %xmm2 +; X86-NEXT: paddd %xmm0, %xmm2 +; X86-NEXT: paddd (%ecx), %xmm1 +; X86-NEXT: movdqa %xmm1, (%eax) +; X86-NEXT: movd %xmm2, 16(%eax) +; X86-NEXT: pextrd $1, %xmm2, 20(%eax) +; X86-NEXT: pextrd $2, %xmm2, 24(%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add7i32: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: movdqa 16(%rsi), %xmm1 -; X64-NEXT: paddd (%rdx), %xmm0 -; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) -; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pinsrd $2, 24(%rsi), %xmm0 +; X64-NEXT: movdqa (%rsi), %xmm1 +; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-NEXT: pinsrd $2, 24(%rdx), %xmm2 +; X64-NEXT: paddd %xmm0, %xmm2 +; X64-NEXT: paddd (%rdx), %xmm1 +; X64-NEXT: movdqa %xmm1, (%rdi) +; X64-NEXT: movq %xmm2, 16(%rdi) +; X64-NEXT: pextrd $2, %xmm2, 24(%rdi) ; X64-NEXT: retq %a = load %i32vec7, %i32vec7* %ap, align 16 %b = load %i32vec7, %i32vec7* %bp, align 16 @@ -211,24 +219,28 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqa (%edx), %xmm0 -; X86-NEXT: movdqa 16(%edx), %xmm1 -; X86-NEXT: paddw (%ecx), %xmm0 -; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: movdqa %xmm0, (%eax) +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%edx), %xmm0 +; X86-NEXT: movdqa (%edx), %xmm1 +; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%ecx), %xmm2 +; X86-NEXT: paddw %xmm0, %xmm2 +; X86-NEXT: paddw (%ecx), %xmm1 +; X86-NEXT: movdqa %xmm1, (%eax) +; X86-NEXT: movd %xmm2, 16(%eax) +; X86-NEXT: pextrd $1, %xmm2, 20(%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add12i16: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: movdqa 16(%rsi), %xmm1 -; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) -; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movdqa (%rsi), %xmm1 +; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: paddw (%rdx), %xmm1 +; X64-NEXT: movq %xmm2, 16(%rdi) +; X64-NEXT: movdqa %xmm1, (%rdi) ; X64-NEXT: retq %a = load %i16vec12, %i16vec12* %ap, align 16 %b = load %i16vec12, %i16vec12* %bp, align 16 @@ -244,29 +256,31 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqa 
32(%edx), %xmm0 +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 +; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: paddw %xmm0, %xmm3 ; X86-NEXT: paddw (%ecx), %xmm1 -; X86-NEXT: paddw 32(%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm2 ; X86-NEXT: movdqa %xmm2, 16(%eax) -; X86-NEXT: movd %xmm0, 32(%eax) +; X86-NEXT: movd %xmm3, 32(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add18i16: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: movdqa 16(%rsi), %xmm1 -; X64-NEXT: movdqa 32(%rsi), %xmm2 -; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 32(%rdx), %xmm2 -; X64-NEXT: paddw 16(%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, 16(%rdi) -; X64-NEXT: movd %xmm2, 32(%rdi) -; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movdqa (%rsi), %xmm1 +; X64-NEXT: movdqa 16(%rsi), %xmm2 +; X64-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: paddw %xmm0, %xmm3 +; X64-NEXT: paddw (%rdx), %xmm1 +; X64-NEXT: paddw 16(%rdx), %xmm2 +; X64-NEXT: movdqa %xmm2, 16(%rdi) +; X64-NEXT: movd %xmm3, 32(%rdi) +; X64-NEXT: movdqa %xmm1, (%rdi) ; X64-NEXT: retq %a = load %i16vec18, %i16vec18* %ap, align 16 %b = load %i16vec18, %i16vec18* %bp, align 16 @@ -313,30 +327,46 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqa (%edx), %xmm0 -; X86-NEXT: movdqa 16(%edx), %xmm1 -; X86-NEXT: paddb (%ecx), %xmm0 -; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrw $6, %xmm1, 28(%eax) -; X86-NEXT: pextrb $14, %xmm1, 30(%eax) -; X86-NEXT: movdqa %xmm0, (%eax) +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%edx), %xmm0 +; X86-NEXT: pinsrd $2, 24(%edx), %xmm0 +; X86-NEXT: pinsrw $6, 28(%edx), %xmm0 +; X86-NEXT: pinsrb $14, 30(%edx), %xmm0 +; X86-NEXT: movdqa (%edx), %xmm1 +; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: pinsrd $1, 20(%ecx), %xmm2 +; X86-NEXT: pinsrd $2, 24(%ecx), %xmm2 +; X86-NEXT: pinsrw $6, 28(%ecx), %xmm2 +; X86-NEXT: pinsrb $14, 30(%ecx), %xmm2 +; X86-NEXT: paddb %xmm0, %xmm2 +; X86-NEXT: paddb (%ecx), %xmm1 +; X86-NEXT: movdqa %xmm1, (%eax) +; X86-NEXT: movd %xmm2, 16(%eax) +; X86-NEXT: pextrd $1, %xmm2, 20(%eax) +; X86-NEXT: pextrd $2, %xmm2, 24(%eax) +; X86-NEXT: pextrw $6, %xmm2, 28(%eax) +; X86-NEXT: pextrb $14, %xmm2, 30(%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add31i8: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: movdqa 16(%rsi), %xmm1 -; X64-NEXT: paddb (%rdx), %xmm0 -; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) -; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) -; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) -; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pinsrd $2, 24(%rsi), %xmm0 +; X64-NEXT: pinsrw $6, 28(%rsi), %xmm0 +; X64-NEXT: pinsrb $14, 30(%rsi), %xmm0 +; X64-NEXT: movdqa (%rsi), %xmm1 +; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-NEXT: pinsrd $2, 24(%rdx), %xmm2 +; X64-NEXT: pinsrw $6, 28(%rdx), %xmm2 +; X64-NEXT: pinsrb $14, 30(%rdx), %xmm2 +; X64-NEXT: paddb %xmm0, %xmm2 +; X64-NEXT: paddb (%rdx), %xmm1 +; X64-NEXT: movdqa %xmm1, (%rdi) +; X64-NEXT: movq %xmm2, 16(%rdi) +; 
X64-NEXT: pextrd $2, %xmm2, 24(%rdi) +; X64-NEXT: pextrw $6, %xmm2, 28(%rdi) +; X64-NEXT: pextrb $14, %xmm2, 30(%rdi) ; X64-NEXT: retq %a = load %i8vec31, %i8vec31* %ap, align 16 %b = load %i8vec31, %i8vec31* %bp, align 16
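
Illustrative note (not part of the patch): the load-partial-dot-product.ll change above shows the intended effect of keying load widening off the number of dereferenceable bytes rather than alignment alone. Below is a minimal IR sketch of the same situation, assuming an SSE-level x86 target; the function and value names are made up for illustration only.

  ; Only 12 bytes are loaded and the pointer is only 4-byte aligned, but the
  ; dereferenceable(16) attribute guarantees that the padding bytes of the
  ; widened <4 x float> are readable, so type legalization may now emit a
  ; single unaligned 16-byte load (e.g. movups) instead of splitting the load
  ; into smaller pieces.
  define <3 x float> @load_v3f32(<3 x float>* align 4 dereferenceable(16) %p) {
    %v = load <3 x float>, <3 x float>* %p, align 4
    ret <3 x float> %v
  }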