Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -446,8 +446,7 @@ // Broadcast subvector to vector. SUBV_BROADCAST, - // Insert/Extract vector element. - VINSERT, + // Extract vector element. VEXTRACT, /// SSE4A Extraction and Insertion. Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -13776,24 +13776,36 @@ return ExtractBitFromMaskVector(Op, DAG); if (!isa(Idx)) { - if (VecVT.is512BitVector() || - (VecVT.is256BitVector() && Subtarget.hasInt256() && - VecVT.getScalarSizeInBits() == 32)) { - - MVT MaskEltVT = - MVT::getIntegerVT(VecVT.getScalarSizeInBits()); - MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / - MaskEltVT.getSizeInBits()); - - Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, - getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, - DAG.getConstant(0, dl, PtrVT)); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, - DAG.getConstant(0, dl, PtrVT)); - } + // It's more profitable to go through memory (1 cycle throughput) + // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput). + // The IACA tool was used to get the performance estimation + // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) + // + // example: extractelement <16 x i8> %a, i32 %i + // + // Block Throughput: 3.00 Cycles + // Throughput Bottleneck: Port5 + // + // | Num Of | Ports pressure in cycles | | + // | Uops | 0 - DV | 5 | 6 | 7 | | + // --------------------------------------------- + // | 1 | | 1.0 | | | CP 
| vmovd xmm1, edi + // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 + // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 + // Total Num Of Uops: 4 + // + // + // Block Throughput: 1.00 Cycles + // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 + // + // | | Ports pressure in cycles | | + // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | + // --------------------------------------------------------- + // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 + // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] + // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] + // Total Num Of Uops: 4 + return SDValue(); } @@ -23937,7 +23949,6 @@ case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -3580,19 +3580,6 @@ def : Pat<(v8i64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; } - -def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - -def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - 
//===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -453,9 +453,6 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; -def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, - SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -4702,19 +4702,6 @@ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt - -def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - -def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int // Index: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- 
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll +++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll @@ -124,16 +124,30 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind { ; KNL-LABEL: test7: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test7: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $15, %edi +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %e = extractelement <16 x float> %x, i32 %ind ret float %e @@ -142,18 +156,30 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind { ; KNL-LABEL: test8: ; KNL: ## BB#0: -; KNL-NEXT: movslq %edi, %rax -; KNL-NEXT: vmovq %rax, %xmm1 -; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test8: ; SKX: ## BB#0: -; SKX-NEXT: movslq %edi, %rax -; SKX-NEXT: vmovq %rax, %xmm1 -; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; 
SKX-NEXT: pushq %rbp +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %e = extractelement <8 x double> %x, i32 %ind ret double %e @@ -162,16 +188,30 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind { ; KNL-LABEL: test9: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test9: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %ymm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %e = extractelement <8 x float> %x, i32 %ind ret float %e @@ -180,16 +220,30 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { ; KNL-LABEL: test10: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: pushq %rbp +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: movl (%rsp,%rdi,4), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: 
popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test10: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: pushq %rbp +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $15, %edi +; SKX-NEXT: movl (%rsp,%rdi,4), %eax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %e = extractelement <16 x i32> %x, i32 %ind ret i32 %e @@ -1514,18 +1568,42 @@ define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v8i64: ; KNL: ## BB#0: -; KNL-NEXT: movslq %edi, %rax -; KNL-NEXT: vmovq %rax, %xmm1 -; KNL-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi6: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi7: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi8: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: movq (%rsp,%rdi,8), %rax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8i64: ; SKX: ## BB#0: -; SKX-NEXT: movslq %edi, %rax -; SKX-NEXT: vmovq %rax, %xmm1 -; SKX-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi3: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi4: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi5: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: movq (%rsp,%rdi,8), %rax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; 
SKX-NEXT: retq %t2 = extractelement <8 x i64> %t1, i32 %index ret i64 %t2 @@ -1555,12 +1633,12 @@ ; KNL-LABEL: test_extractelement_variable_v4f64: ; KNL: ## BB#0: ; KNL-NEXT: pushq %rbp -; KNL-NEXT: Lcfi6: +; KNL-NEXT: Lcfi9: ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: Lcfi7: +; KNL-NEXT: Lcfi10: ; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: Lcfi8: +; KNL-NEXT: Lcfi11: ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp @@ -1575,12 +1653,12 @@ ; SKX-LABEL: test_extractelement_variable_v4f64: ; SKX: ## BB#0: ; SKX-NEXT: pushq %rbp -; SKX-NEXT: Lcfi3: +; SKX-NEXT: Lcfi6: ; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: Lcfi4: +; SKX-NEXT: Lcfi7: ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: Lcfi5: +; SKX-NEXT: Lcfi8: ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-32, %rsp ; SKX-NEXT: subq $64, %rsp @@ -1598,18 +1676,42 @@ define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v8f64: ; KNL: ## BB#0: -; KNL-NEXT: movslq %edi, %rax -; KNL-NEXT: vmovq %rax, %xmm1 -; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi12: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi13: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi14: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8f64: ; SKX: ## BB#0: -; SKX-NEXT: movslq %edi, %rax -; SKX-NEXT: vmovq %rax, %xmm1 -; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi9: +; 
SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi10: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi11: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %t2 = extractelement <8 x double> %t1, i32 %index ret double %t2 @@ -1638,16 +1740,42 @@ define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v8i32: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi15: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi16: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi17: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: movl (%rsp,%rdi,4), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8i32: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi12: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi13: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi14: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %ymm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: movl (%rsp,%rdi,4), %eax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %t2 = extractelement <8 x i32> %t1, i32 %index ret i32 %t2 @@ 
-1656,16 +1784,42 @@ define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v16i32: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi18: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi19: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi20: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: movl (%rsp,%rdi,4), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v16i32: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi15: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi16: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi17: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $15, %edi +; SKX-NEXT: movl (%rsp,%rdi,4), %eax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %t2 = extractelement <16 x i32> %t1, i32 %index ret i32 %t2 @@ -1694,16 +1848,42 @@ define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v8f32: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi21: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi22: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi23: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; 
KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v8f32: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi18: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi19: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi20: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %ymm0, (%rsp) +; SKX-NEXT: andl $7, %edi +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %t2 = extractelement <8 x float> %t1, i32 %index ret float %t2 @@ -1712,16 +1892,42 @@ define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v16f32: ; KNL: ## BB#0: -; KNL-NEXT: vmovd %edi, %xmm1 -; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi24: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi25: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi26: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %zmm0, (%rsp) +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_variable_v16f32: ; SKX: ## BB#0: -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: ## kill: %XMM0 
%XMM0 %ZMM0 +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi21: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi22: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi23: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %zmm0, (%rsp) +; SKX-NEXT: andl $15, %edi +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp ; SKX-NEXT: retq %t2 = extractelement <16 x float> %t1, i32 %index ret float %t2 @@ -1751,12 +1957,12 @@ ; KNL-LABEL: test_extractelement_variable_v16i16: ; KNL: ## BB#0: ; KNL-NEXT: pushq %rbp -; KNL-NEXT: Lcfi9: +; KNL-NEXT: Lcfi27: ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: Lcfi10: +; KNL-NEXT: Lcfi28: ; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: Lcfi11: +; KNL-NEXT: Lcfi29: ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp @@ -1771,12 +1977,12 @@ ; SKX-LABEL: test_extractelement_variable_v16i16: ; SKX: ## BB#0: ; SKX-NEXT: pushq %rbp -; SKX-NEXT: Lcfi6: +; SKX-NEXT: Lcfi24: ; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: Lcfi7: +; SKX-NEXT: Lcfi25: ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: Lcfi8: +; SKX-NEXT: Lcfi26: ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-32, %rsp ; SKX-NEXT: subq $64, %rsp @@ -1791,11 +1997,50 @@ ret i16 %t2 } -; TODO - enable after fix -;define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) { -; %t2 = extractelement <32 x i16> %t1, i32 %index -; ret i16 %t2 -;} +define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi30: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi31: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi32: +; 
KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $31, %edi +; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v32i16: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi27: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi28: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi29: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovdqu16 %zmm0, (%rsp) +; SKX-NEXT: andl $31, %edi +; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: retq + %t2 = extractelement <32 x i16> %t1, i32 %index + ret i16 %t2 +} define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) { ; KNL-LABEL: test_extractelement_variable_v16i8: @@ -1823,12 +2068,12 @@ ; KNL-LABEL: test_extractelement_variable_v32i8: ; KNL: ## BB#0: ; KNL-NEXT: pushq %rbp -; KNL-NEXT: Lcfi12: +; KNL-NEXT: Lcfi33: ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: Lcfi13: +; KNL-NEXT: Lcfi34: ; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: Lcfi14: +; KNL-NEXT: Lcfi35: ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp @@ -1844,12 +2089,12 @@ ; SKX-LABEL: test_extractelement_variable_v32i8: ; SKX: ## BB#0: ; SKX-NEXT: pushq %rbp -; SKX-NEXT: Lcfi9: +; SKX-NEXT: Lcfi30: ; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: Lcfi10: +; SKX-NEXT: Lcfi31: ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: Lcfi11: +; SKX-NEXT: Lcfi32: ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-32, %rsp ; SKX-NEXT: subq $64, %rsp @@ -1866,8 +2111,101 @@ ret i8 
%t2 } -; TODO - enable after fix -;define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { -; %t2 = extractelement <64 x i8> %t1, i32 %index -; ret i8 %t2 -;} +define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v64i8: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi36: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi37: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi38: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $63, %edi +; KNL-NEXT: movq %rsp, %rax +; KNL-NEXT: movb (%rdi,%rax), %al +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v64i8: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi33: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi34: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi35: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovdqu8 %zmm0, (%rsp) +; SKX-NEXT: andl $63, %edi +; SKX-NEXT: movq %rsp, %rax +; SKX-NEXT: movb (%rdi,%rax), %al +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: retq + + %t2 = extractelement <64 x i8> %t1, i32 %index + ret i8 %t2 +} + +define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) { +; KNL-LABEL: test_extractelement_variable_v64i8_indexi8: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi39: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi40: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi41: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: 
addb %dil, %dil +; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: movzbl %dil, %eax +; KNL-NEXT: andl $63, %eax +; KNL-NEXT: movq %rsp, %rcx +; KNL-NEXT: movb (%rax,%rcx), %al +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v64i8_indexi8: +; SKX: ## BB#0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: Lcfi36: +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: Lcfi37: +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: Lcfi38: +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: addb %dil, %dil +; SKX-NEXT: vmovdqu8 %zmm0, (%rsp) +; SKX-NEXT: movzbl %dil, %eax +; SKX-NEXT: andl $63, %eax +; SKX-NEXT: movq %rsp, %rcx +; SKX-NEXT: movb (%rax,%rcx), %al +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: retq + + %i = add i8 %index, %index + %t2 = extractelement <64 x i8> %t1, i8 %i + ret i8 %t2 +} Index: llvm/trunk/test/CodeGen/X86/extractelement-index.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/extractelement-index.ll +++ llvm/trunk/test/CodeGen/X86/extractelement-index.ll @@ -538,27 +538,19 @@ ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX1-LABEL: extractelement_v8i32_var: -; AVX1: # BB#0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movl (%rsp,%rdi,4), %eax -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: extractelement_v8i32_var: -; AVX2: # BB#0: -; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: extractelement_v8i32_var: +; AVX: # BB#0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: movq %rsp, %rbp +; 
AVX-NEXT: andq $-32, %rsp +; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $7, %edi +; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: movl (%rsp,%rdi,4), %eax +; AVX-NEXT: movq %rbp, %rsp +; AVX-NEXT: popq %rbp +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %b = extractelement <8 x i32> %a, i256 %i ret i32 %b } Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -236,70 +236,43 @@ } define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { -; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: -; AVX1: # BB#0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: # kill: %R9D %R9D %R9 -; AVX1-NEXT: # kill: %R8D %R8D %R8 -; AVX1-NEXT: # kill: %ECX %ECX %RCX -; AVX1-NEXT: # kill: %EDX %EDX %RDX -; AVX1-NEXT: # kill: %ESI %ESI %RSI -; AVX1-NEXT: # kill: %EDI %EDI %RDI -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: andl $7, %esi -; AVX1-NEXT: andl $7, %edx -; AVX1-NEXT: andl $7, %ecx -; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: movl 16(%rbp), %r10d -; AVX1-NEXT: andl $7, %r10d -; AVX1-NEXT: movl 24(%rbp), %eax -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} 
xmm0 = xmm3[0,1],xmm0[0],xmm3[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vmovd %esi, %xmm2 -; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2 -; AVX2-NEXT: vmovd %edx, %xmm3 -; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-NEXT: vmovd %ecx, %xmm4 -; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4 -; AVX2-NEXT: vmovd %r8d, %xmm5 -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5 -; AVX2-NEXT: vmovd %r9d, %xmm6 -; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6 -; AVX2-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7 -; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: +; ALL: # BB#0: +; ALL-NEXT: pushq %rbp +; ALL-NEXT: movq %rsp, %rbp +; ALL-NEXT: andq $-32, %rsp +; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: # kill: %R9D %R9D %R9 +; ALL-NEXT: # kill: %R8D %R8D %R8 +; ALL-NEXT: # kill: %ECX %ECX %RCX +; ALL-NEXT: # kill: %EDX %EDX %RDX +; ALL-NEXT: # kill: %ESI %ESI %RSI +; ALL-NEXT: # kill: %EDI %EDI %RDI +; ALL-NEXT: andl $7, %edi +; ALL-NEXT: andl $7, %esi +; ALL-NEXT: andl $7, %edx +; ALL-NEXT: andl $7, %ecx +; ALL-NEXT: andl $7, %r8d +; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: andl $7, %r9d +; ALL-NEXT: movl 
16(%rbp), %r10d +; ALL-NEXT: andl $7, %r10d +; ALL-NEXT: movl 24(%rbp), %eax +; ALL-NEXT: andl $7, %eax +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; ALL-NEXT: movq %rbp, %rsp +; ALL-NEXT: popq %rbp +; ALL-NEXT: retq %x0 = extractelement <8 x float> %x, i32 %i0 %x1 = extractelement <8 x float> %x, i32 %i1 %x2 = extractelement <8 x float> %x, i32 %i2