Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -446,8 +446,7 @@
       // Broadcast subvector to vector.
       SUBV_BROADCAST,
 
-      // Insert/Extract vector element.
-      VINSERT,
+      // Extract vector element.
       VEXTRACT,
 
       /// SSE4A Extraction and Insertion.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13623,9 +13623,12 @@
          "Unexpected vector type in ExtractBitFromMaskVector");
 
   // variable index can't be handled in mask registers,
-  // extend vector to VR512
+  // extend vector to VR512/256/128
   if (!isa<ConstantSDNode>(Idx)) {
-    MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+    unsigned NumElts = VecVT.getVectorNumElements();
+    // Extend to 256/512-bit if possible, so that VPERMV can be used.
+    unsigned VecSize = (NumElts == 2 ? 128 : (NumElts == 4 ? 256 : 512));
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               ExtVT.getVectorElementType(), Ext, Idx);
@@ -13652,6 +13655,24 @@
                      DAG.getIntPtrConstant(0, dl));
 }
 
+static bool isVPERMVsupported(MVT VT, const X86Subtarget &Subtarget) {
+  if ((Subtarget.hasAVX2() && (VT == MVT::v8f32 || VT == MVT::v8i32)) ||
+      (Subtarget.hasAVX512() && (VT == MVT::v16f32 || VT == MVT::v16i32)))
+    return true; // VPERMD/PS
+  if ((Subtarget.hasAVX512() && (VT == MVT::v8f64 || VT == MVT::v8i64)) ||
+      (Subtarget.hasVLX() && (VT == MVT::v4f64 || VT == MVT::v4i64)))
+    return true; // VPERMQ/PD
+  if ((Subtarget.hasBWI() && VT == MVT::v32i16) ||
+      (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i16))
+    return true; // VPERMW
+  if ((Subtarget.hasVBMI() && VT == MVT::v64i8) ||
+      (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
+       (VT == MVT::v32i8 || VT == MVT::v16i8)))
+    return true; // VPERMB
+
+  return false;
+}
+
 SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
@@ -13664,20 +13685,21 @@
     return ExtractBitFromMaskVector(Op, DAG);
 
   if (!isa<ConstantSDNode>(Idx)) {
-    if (VecVT.is512BitVector() ||
-        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
-         VecVT.getScalarSizeInBits() == 32)) {
+    if (isVPERMVsupported(VecVT, Subtarget)) {
       MVT MaskEltVT = MVT::getIntegerVT(VecVT.getScalarSizeInBits());
-      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
-                                    MaskEltVT.getSizeInBits());
+      unsigned NumElts = VecVT.getSizeInBits() / MaskEltVT.getSizeInBits();
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, NumElts);
       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
       auto PtrVT = getPointerTy(DAG.getDataLayout());
-      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
-                                 DAG.getConstant(0, dl, PtrVT));
+      // Create a BUILD_VECTOR; it will be matched as movd/movq/movss/movsd.
+      SmallVector<SDValue, 16> Ops;
+      Ops.append(NumElts, DAG.getConstant(0, dl, MaskEltVT));
+      Ops[0] = Idx;
+      SDValue Mask = DAG.getBuildVector(MaskVT, dl, Ops);
+
       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
                          DAG.getConstant(0, dl, PtrVT));
@@ -23832,7 +23854,6 @@
   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
-  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
@@ -26955,15 +26976,7 @@
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
     // If we have a single input lane-crossing shuffle then lower to VPERMV.
     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
-        ((Subtarget.hasAVX2() &&
-          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
-         (Subtarget.hasAVX512() &&
-          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
-           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
-         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
-         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
-         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
-         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+        isVPERMVsupported(MaskVT, Subtarget)) {
       MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
       MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
       SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
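[Reviewer note, not part of the patch] A minimal sketch of what the new
LowerEXTRACT_VECTOR_ELT path produces for a variable-index extract, assuming a
v8i64 source; the selected code matches test_extractelement_varible_v8i64 in
the updated test file below, everything else is illustrative:

    ; IR:
    ;   %r = extractelement <8 x i64> %v, i32 %i
    ; DAG after lowering (schematic):
    ;   Mask = BUILD_VECTOR (zext/trunc %i), 0, 0, 0, 0, 0, 0, 0
    ;   Perm = X86ISD::VPERMV Mask, %v
    ;   %r   = extract_vector_elt Perm, 0
    ; Selected code:
    ;   movslq %edi, %rax           ; materialize the index in a GPR
    ;   vmovq  %rax, %xmm1          ; BUILD_VECTOR matched as movq
    ;   vpermq %zmm0, %zmm1, %zmm0  ; element %i -> lane 0
    ;   vmovq  %xmm0, %rax          ; extract lane 0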
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3579,19 +3579,6 @@
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
 }
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
 //===----------------------------------------------------------------------===//
 // AVX-512 - Non-temporals
 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -437,9 +437,6 @@
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
 
-def X86Vinsert   : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
-                              [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
-                               SDTCisPtrTy<3>]>, []>;
 def X86Vextract  : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
                               [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
                                SDTCisPtrTy<2>]>, []>;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4706,19 +4706,6 @@
                                   (iPTR 0))), addr:$dst)],
                                   IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 } // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-          (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
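[Reviewer note, not part of the patch] The X86Vinsert patterns deleted above
existed only to match the zero-vector-plus-index mask built by the old
X86ISD::VINSERT node. The lowering now emits a plain BUILD_VECTOR with the
index in lane 0 and zeros elsewhere, schematically:

    t1: i64 = zero_extend %idx
    t2: v8i64 = BUILD_VECTOR t1, 0, 0, 0, 0, 0, 0, 0

which the pre-existing movd/movq (and movss/movsd) patterns already select, so
the dedicated node and its patterns are no longer needed.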
Index: test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- test/CodeGen/X86/avx512-insert-extract.ll
+++ test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=cannonlake | FileCheck --check-prefix=SKX --check-prefix=CNL %s
 
 define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
 ; KNL-LABEL: test1:
@@ -1446,3 +1447,659 @@
   %res = select i1 %t2, i8 3, i8 4
   ret i8 %res
 }
+
+define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v2i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,8), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v2i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1
+; SKX-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    movl -24(%rsp,%rdi,8), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <2 x i64> %a, %b
+  %t2 = extractelement <2 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
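+
+; Reviewer note (illustrative): for i1 mask vectors, ExtractBitFromMaskVector
+; first zero-extends to a 128/256/512-bit integer vector (v2i64/v4i64/v8i64/
+; v16i32), so the v4i1 case below can use VPERMQ on SKX, while v2i64 has no
+; VPERMV form and the v2i1 case above falls back to the stack.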
+
+define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1
+; SKX-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    ## kill: %EAX %EAX %RAX
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <4 x i32> %a, %b
+  %t2 = extractelement <4 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %YMM1 %YMM1 %ZMM1
+; KNL-NEXT:    ## kill: %YMM0 %YMM0 %ZMM0
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
+; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    movslq %edi, %rax
+; KNL-NEXT:    vmovq %rax, %xmm1
+; KNL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    ## kill: %EAX %EAX %RAX
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1
+; SKX-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    ## kill: %EAX %EAX %RAX
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <8 x i32> %a, %b
+  %t2 = extractelement <8 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <16 x i32> %a, %b
+  %t2 = extractelement <16 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v32i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi3:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi4:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi5:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    andb $1, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT:    vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    movzwl %di, %eax
+; SKX-NEXT:    vmovd %eax, %xmm1
+; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <32 x i8> %a, %b
+  %t2 = extractelement <32 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+; TODO - enable this test, it currently fails on KNL.
+;define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b, i32 %index) {
+;  %t1 = icmp ugt <64 x i8> %a, %b
+;  %t2 = extractelement <64 x i1> %t1, i32 %index
+;  %res = zext i1 %t2 to i8
+;  ret i8 %res
+;}
+
+define i64 @test_extractelement_varible_v2i64(<2 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v2i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    movq -24(%rsp,%rdi,8), %rax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v2i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    movq -24(%rsp,%rdi,8), %rax
+; SKX-NEXT:    retq
+  %t2 = extractelement <2 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define i64 @test_extractelement_varible_v4i64(<4 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi6:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi7:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi8:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define i64 @test_extractelement_varible_v8i64(<8 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movslq %edi, %rax
+; KNL-NEXT:    vmovq %rax, %xmm1
+; KNL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define double @test_extractelement_varible_v2f64(<2 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v2f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v2f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    retq
+  %t2 = extractelement <2 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define double @test_extractelement_varible_v4f64(<4 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi9:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi10:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi11:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    ## kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define double @test_extractelement_varible_v8f64(<8 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movslq %edi, %rax
+; KNL-NEXT:    vmovq %rax, %xmm1
+; KNL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    ## kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movslq %edi, %rax
+; SKX-NEXT:    vmovq %rax, %xmm1
+; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    ## kill: %XMM0 %XMM0 %ZMM0
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define i32 @test_extractelement_varible_v4i32(<4 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define i32 @test_extractelement_varible_v8i32(<8 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define i32 @test_extractelement_varible_v16i32(<16 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v16i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define float @test_extractelement_varible_v4f32(<4 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define float @test_extractelement_varible_v8f32(<8 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    ## kill: %XMM0 %XMM0 %YMM0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    ## kill: %XMM0 %XMM0 %YMM0
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define float @test_extractelement_varible_v16f32(<16 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; KNL-NEXT:    ## kill: %XMM0 %XMM0 %ZMM0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovd %edi, %xmm1
+; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    ## kill: %XMM0 %XMM0 %ZMM0
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define i16 @test_extractelement_varible_v8i16(<8 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX-NEXT:    vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i16 @test_extractelement_varible_v16i16(<16 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi12:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi13:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi14:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v16i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movzwl %di, %eax
+; SKX-NEXT:    vmovd %eax, %xmm1
+; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    ## kill: %AX %AX %EAX
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i16 @test_extractelement_varible_v32i16(<32 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v32i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi15:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi16:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi17:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movzwl %di, %eax
+; SKX-NEXT:    vmovd %eax, %xmm1
+; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    ## kill: %AX %AX %EAX
+; SKX-NEXT:    retq
+  %t2 = extractelement <32 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i8 @test_extractelement_varible_v16i8(<16 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    retq
+;
+; SKX_ONLY-LABEL: test_extractelement_varible_v16i8:
+; SKX_ONLY:       ## BB#0:
+; SKX_ONLY-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX_ONLY-NEXT:    vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
+; SKX_ONLY-NEXT:    andl $15, %edi
+; SKX_ONLY-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; SKX_ONLY-NEXT:    movb (%rdi,%rax), %al
+; SKX_ONLY-NEXT:    retq
+;
+; CNL-LABEL: test_extractelement_varible_v16i8:
+; CNL:       ## BB#0:
+; CNL-NEXT:    movzbl %dil, %eax
+; CNL-NEXT:    vmovd %eax, %xmm1
+; CNL-NEXT:    vpermb %xmm0, %xmm1, %xmm0
+; CNL-NEXT:    vpextrb $0, %xmm0, %eax
+; CNL-NEXT:    ## kill: %AL %AL %EAX
+; CNL-NEXT:    retq
+  %t2 = extractelement <16 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_varible_v32i8(<32 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v32i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi18:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi19:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi20:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX_ONLY-LABEL: test_extractelement_varible_v32i8:
+; SKX_ONLY:       ## BB#0:
+; SKX_ONLY-NEXT:    pushq %rbp
+; SKX_ONLY-NEXT:  Lcfi0:
+; SKX_ONLY-NEXT:    .cfi_def_cfa_offset 16
+; SKX_ONLY-NEXT:  Lcfi1:
+; SKX_ONLY-NEXT:    .cfi_offset %rbp, -16
+; SKX_ONLY-NEXT:    movq %rsp, %rbp
+; SKX_ONLY-NEXT:  Lcfi2:
+; SKX_ONLY-NEXT:    .cfi_def_cfa_register %rbp
+; SKX_ONLY-NEXT:    andq $-32, %rsp
+; SKX_ONLY-NEXT:    subq $64, %rsp
+; SKX_ONLY-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX_ONLY-NEXT:    vmovdqu %ymm0, (%rsp)
+; SKX_ONLY-NEXT:    andl $31, %edi
+; SKX_ONLY-NEXT:    movq %rsp, %rax
+; SKX_ONLY-NEXT:    movb (%rdi,%rax), %al
+; SKX_ONLY-NEXT:    movq %rbp, %rsp
+; SKX_ONLY-NEXT:    popq %rbp
+; SKX_ONLY-NEXT:    retq
+;
+; CNL-LABEL: test_extractelement_varible_v32i8:
+; CNL:       ## BB#0:
+; CNL-NEXT:    movzbl %dil, %eax
+; CNL-NEXT:    vmovd %eax, %xmm1
+; CNL-NEXT:    vpermb %ymm0, %ymm1, %ymm0
+; CNL-NEXT:    vpextrb $0, %xmm0, %eax
+; CNL-NEXT:    ## kill: %AL %AL %EAX
+; CNL-NEXT:    retq
+  %t2 = extractelement <32 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_varible_v64i8(<64 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v64i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi21:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi22:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi23:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI %EDI %RDI
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $63, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX_ONLY-LABEL: test_extractelement_varible_v64i8:
+; SKX_ONLY:       ## BB#0:
+; SKX_ONLY-NEXT:    pushq %rbp
+; SKX_ONLY-NEXT:  Lcfi3:
+; SKX_ONLY-NEXT:    .cfi_def_cfa_offset 16
+; SKX_ONLY-NEXT:  Lcfi4:
+; SKX_ONLY-NEXT:    .cfi_offset %rbp, -16
+; SKX_ONLY-NEXT:    movq %rsp, %rbp
+; SKX_ONLY-NEXT:  Lcfi5:
+; SKX_ONLY-NEXT:    .cfi_def_cfa_register %rbp
+; SKX_ONLY-NEXT:    andq $-64, %rsp
+; SKX_ONLY-NEXT:    subq $128, %rsp
+; SKX_ONLY-NEXT:    ## kill: %EDI %EDI %RDI
+; SKX_ONLY-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX_ONLY-NEXT:    andl $63, %edi
+; SKX_ONLY-NEXT:    movq %rsp, %rax
+; SKX_ONLY-NEXT:    movb (%rdi,%rax), %al
+; SKX_ONLY-NEXT:    movq %rbp, %rsp
+; SKX_ONLY-NEXT:    popq %rbp
+; SKX_ONLY-NEXT:    retq
+;
+; CNL-LABEL: test_extractelement_varible_v64i8:
+; CNL:       ## BB#0:
+; CNL-NEXT:    movzbl %dil, %eax
+; CNL-NEXT:    vmovd %eax, %xmm1
+; CNL-NEXT:    vpermb %zmm0, %zmm1, %zmm0
+; CNL-NEXT:    vpextrb $0, %xmm0, %eax
+; CNL-NEXT:    ## kill: %AL %AL %EAX
+; CNL-NEXT:    retq
+  %t2 = extractelement <64 x i8> %t1, i32 %index
+  ret i8 %t2
+}
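[Reviewer summary, not part of the patch] Across the new tests, the variable
index is moved into lane 0 of the shuffle mask according to the element width,
with the remaining lanes left zero:

    vmovd  %edi, %xmm1              ; 32-bit elements (VPERMD/VPERMPS)
    movslq %edi, %rax               ; 64-bit elements (VPERMQ/VPERMPD)
    vmovq  %rax, %xmm1
    movzwl %di, %eax                ; 16-bit elements (VPERMW, BWI)
    vmovd  %eax, %xmm1
    movzbl %dil, %eax               ; 8-bit elements (VPERMB, VBMI/cannonlake)
    vmovd  %eax, %xmm1

Types with no matching VPERMV form in isVPERMVsupported (e.g. v4i32, v8i16,
and v16i8 without VBMI) keep the existing stack-spill lowering, as the
corresponding SKX/SKX_ONLY checks show.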