Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7832,8 +7832,6 @@
 // TODO: Handle undefs
 // TODO: Utilize pshufb and zero mask blending to support more efficient
 // construction of vectors with constant-0 elements.
-// TODO: Use smaller-element vectors of same width, and "interpolate" the
-// indices, when no native operation available.
 static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
 const X86Subtarget &Subtarget) {
@@ -7847,11 +7845,22 @@
 if (Subtarget.hasSSE3())
 Opcode = X86ISD::PSHUFB;
 break;
+ case MVT::v8i16:
+ if (Subtarget.hasVLX() && Subtarget.hasBWI())
+ Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
+ }
+ break;
 case MVT::v4f32:
 case MVT::v4i32:
 if (Subtarget.hasAVX()) {
 Opcode = X86ISD::VPERMILPV;
 ShuffleVT = MVT::v4f32;
+ } else if (Subtarget.hasSSE3()) {
+ Opcode = X86ISD::PSHUFB;
+ ShuffleVT = MVT::v16i8;
 }
 break;
 case MVT::v2f64:
@@ -7870,6 +7879,10 @@
 case MVT::v4f64:
 if (Subtarget.hasVLX())
 Opcode = X86ISD::VPERMV;
+ else if (Subtarget.hasAVX2()) {
+ Opcode = X86ISD::VPERMV;
+ ShuffleVT = MVT::v8f32;
+ }
 break;
 case MVT::v16f32:
 case MVT::v8f64:
@@ -7882,7 +7895,6 @@
 if (Subtarget.hasBWI())
 Opcode = X86ISD::VPERMV;
 break;
- case MVT::v8i16:
 case MVT::v16i16:
 if (Subtarget.hasVLX() && Subtarget.hasBWI())
 Opcode = X86ISD::VPERMV;
@@ -7941,8 +7953,8 @@
 unsigned Opcode = LegalPermuteOpcode(VT, ShuffleVT);
 if (!Opcode)
 return SDValue();
- assert(VT.getScalarSizeInBits() == ShuffleVT.getScalarSizeInBits() &&
- VT.getVectorNumElements() == ShuffleVT.getVectorNumElements() &&
+ assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
+ (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
 "Illegal variable permute shuffle type");
 unsigned NumElts = VT.getVectorNumElements();
@@ -7964,6 +7976,29 @@
 SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
 }
+ unsigned Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
+ if (Scale > 1) {
+ assert(isPowerOf2_32(Scale) && "Illegal variable permute shuffle scale");
+ unsigned ShuffleBits = ShuffleVT.getScalarSizeInBits();
+ uint64_t IndexScale = 0;
+ uint64_t IndexOffset = 0;
+
+ for (uint64_t i = 0; i != Scale; ++i) {
+ IndexScale += 1ull << (i * ShuffleBits);
+ IndexOffset += i << (i * ShuffleBits);
+ }
+ IndexScale *= Scale;
+
+ SDLoc DL(IndicesVec);
+ IndicesVec = DAG.getNode(ISD::MUL, DL, IndicesVT, IndicesVec,
+ DAG.getConstant(IndexScale, DL, IndicesVT));
+ IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec,
+ DAG.getConstant(IndexOffset, DL, IndicesVT));
+ }
+
+ EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
+ IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
+
 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
 SDValue Res = Opcode == X86ISD::VPERMV
Index: test/CodeGen/X86/var-permute-128.ll
===================================================================
--- test/CodeGen/X86/var-permute-128.ll
+++ test/CodeGen/X86/var-permute-128.ll
@@ -37,25 +37,15 @@
 define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v4i32:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; SSSE3-NEXT: movd %xmm2, %ecx
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %edx
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSSE3-NEXT: movd %xmm1, %esi
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: andl $3, %eax
-; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: andl $3, %edx
-; SSSE3-NEXT: andl $3, %esi
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: var_shuffle_v4i32:
@@ -80,76 +70,16 @@
 define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v8i16:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd %xmm1, %r8d
-; SSSE3-NEXT: pextrw $1, %xmm1, %r9d
-; SSSE3-NEXT: pextrw $2, %xmm1, %r10d
-; SSSE3-NEXT: pextrw $3, %xmm1, %esi
-; SSSE3-NEXT: pextrw $4, %xmm1, %edi
-; SSSE3-NEXT: pextrw $5, %xmm1, %eax
-; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
-; SSSE3-NEXT: pextrw $7, %xmm1, %edx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: andl $7, %r8d
-; SSSE3-NEXT: andl $7, %r9d
-; SSSE3-NEXT: andl $7, %r10d
-; SSSE3-NEXT: andl $7, %esi
-; SSSE3-NEXT: andl $7, %edi
-; SSSE3-NEXT: andl $7, %eax
-; SSSE3-NEXT: andl $7, %ecx
-; SSSE3-NEXT: andl $7, %edx
-; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; AVXNOVLBW-LABEL: var_shuffle_v8i16:
 ; AVXNOVLBW: # %bb.0:
-; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
-; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
-; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
-; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
-; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
-; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
-; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
-; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
-; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVXNOVLBW-NEXT: andl $7, %eax
-; AVXNOVLBW-NEXT: andl $7, %r10d
-; AVXNOVLBW-NEXT: andl $7, %ecx
-; AVXNOVLBW-NEXT: andl $7, %edx
-; AVXNOVLBW-NEXT: andl $7, %esi
-; AVXNOVLBW-NEXT: andl $7, %edi
-; AVXNOVLBW-NEXT: andl $7, %r8d
-; AVXNOVLBW-NEXT: andl $7, %r9d
-; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
-; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVXNOVLBW-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1
+; AVXNOVLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVXNOVLBW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v8i16:
@@ -273,25 +203,15 @@
 define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v4f32:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; SSSE3-NEXT: movd %xmm2, %ecx
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %edx
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSSE3-NEXT: movd %xmm1, %esi
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: andl $3, %eax
-; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: andl $3, %edx
-; SSSE3-NEXT: andl $3, %esi
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT: pmuludq %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: var_shuffle_v4f32:
Index: test/CodeGen/X86/var-permute-256.ll
===================================================================
--- test/CodeGen/X86/var-permute-256.ll
+++ test/CodeGen/X86/var-permute-256.ll
@@ -36,56 +36,34 @@
 ;
 ; AVX2-LABEL: var_shuffle_v4i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: andl $3, %eax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdx
-; AVX2-NEXT: andl $3, %edx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $64, %rsp
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: andl $3, %eax
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT: andl $3, %ecx
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm1, %rdx
-; AVX512F-NEXT: andl $3, %edx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: andl $3, %esi
-; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4i64:
@@ -1129,52 +1107,34 @@
 ;
 ; AVX2-LABEL: var_shuffle_v4f64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: andl $3, %eax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdx
-; AVX2-NEXT: andl $3, %edx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4f64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $64, %rsp
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: andl $3, %eax
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT: andl $3, %ecx
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm1, %rdx
-; AVX512F-NEXT: andl $3, %edx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: andl $3, %esi
-; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: movq %rbp, %rsp
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4f64:
@@ -1298,44 +1258,36 @@
 ;
 ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4i64_from_v2i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: andl $1, %eax
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT: andl $1, %ecx
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm1, %rdx
-; AVX512F-NEXT: andl $1, %edx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: andl $1, %esi
-; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
@@ -2325,40 +2277,36 @@
 ;
 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vmovq %xmm1, %rdx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: var_shuffle_v4f64_from_v2f64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: andl $1, %eax
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512F-NEXT: andl $1, %ecx
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm1, %rdx
-; AVX512F-NEXT: andl $1, %edx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: andl $1, %esi
-; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
+; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64: