Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -446,8 +446,7 @@
       // Broadcast subvector to vector.
       SUBV_BROADCAST,
 
-      // Insert/Extract vector element.
-      VINSERT,
+      // Extract vector element.
       VEXTRACT,
 
       /// SSE4A Extraction and Insertion.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -13652,6 +13652,24 @@
                      DAG.getIntPtrConstant(0, dl));
 }
 
+static bool isVPERMVsupported(MVT VT, const X86Subtarget &Subtarget) {
+  if (Subtarget.hasAVX2() && (VT == MVT::v8f32 || VT == MVT::v8i32))
+    return true; // VPERMD/PS
+
+  MVT EltVT = VT.getScalarType();
+  if (Subtarget.hasAVX512() &&
+      (EltVT == MVT::f32 || EltVT == MVT::i32 ||
+       EltVT == MVT::f64 || EltVT == MVT::i64))
+    return true; // VPERMQ/PD/D/PS
+
+  if (Subtarget.hasBWI() && EltVT == MVT::i16)
+    return true; // VPERMW
+  if (Subtarget.hasVBMI() && EltVT == MVT::i8)
+    return true; // VPERMB
+
+  return false;
+}
+
 SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
@@ -13664,21 +13682,28 @@
     return ExtractBitFromMaskVector(Op, DAG);
 
   if (!isa<ConstantSDNode>(Idx)) {
-    if (VecVT.is512BitVector() ||
-        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
-         VecVT.getScalarSizeInBits() == 32)) {
+    bool IsVpermv = isVPERMVsupported(VecVT, Subtarget);
+    if (IsVpermv ||
+        ((VecVT == MVT::v16i8) && Subtarget.hasSSE3())) { //X86ISD::PSHUFB
       MVT MaskEltVT =
         MVT::getIntegerVT(VecVT.getScalarSizeInBits());
-      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
-                                    MaskEltVT.getSizeInBits());
+      unsigned NumElts = VecVT.getSizeInBits() / MaskEltVT.getSizeInBits();
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, NumElts);
 
       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
       auto PtrVT = getPointerTy(DAG.getDataLayout());
-      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
-                                 DAG.getConstant(0, dl, PtrVT));
-      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      // create BUILD_VECTOR, it will be matched as movd/movq/movss/movsd.
+      SmallVector<SDValue, 64> Ops;
+      Ops.append(NumElts, DAG.getConstant(0, dl, MaskEltVT));
+      Ops[0] = Idx;
+      SDValue Mask = DAG.getBuildVector(MaskVT, dl, Ops);
+
+      SDValue Perm;
+      if (IsVpermv)
+        Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      else
+        Perm = DAG.getNode(X86ISD::PSHUFB, dl, VecVT, Vec, Mask);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
                          DAG.getConstant(0, dl, PtrVT));
     }
@@ -23832,7 +23857,6 @@
   case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
   case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
   case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
-  case X86ISD::VINSERT: return "X86ISD::VINSERT";
   case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
   case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
   case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
@@ -26955,15 +26979,7 @@
   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
     // If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros && - ((Subtarget.hasAVX2() && - (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || - (Subtarget.hasAVX512() && - (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || - MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || - (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || - (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || - (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || - (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { + isVPERMVsupported(MaskVT, Subtarget)) { MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3579,19 +3579,6 @@ def : Pat<(v8i64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; } - -def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - -def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// @@ -4855,9 +4842,10 @@ // Use 512bit version to implement 128/256 bit in case NoVLX. multiclass avx512_var_shift_lowering p> { - let Predicates = p in { - def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + SDNode OpNode, Predicate pAll, + Predicate p128, Predicate p256> { + let Predicates = [pAll, p256] in + def : Pat<(_.info256.VT (OpNode _.info256.RC:$src1, (_.info256.VT _.info256.RC:$src2))), (EXTRACT_SUBREG (!cast(OpcodeStr#"Zrr") @@ -4865,15 +4853,16 @@ (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + let Predicates = [pAll, p128] in + def : Pat<(_.info128.VT (OpNode _.info128.RC:$src1, (_.info128.VT _.info128.RC:$src2))), (EXTRACT_SUBREG (!cast(OpcodeStr#"Zrr") (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; - } } + multiclass avx512_var_shift_w opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -4900,10 +4889,10 @@ defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; -defm : avx512_var_shift_lowering; -defm : avx512_var_shift_lowering; -defm : avx512_var_shift_lowering; -defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; // Special handing for handling VPSRAV intrinsics. 
multiclass avx512_var_shift_int_lowering, VEX_W; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; + +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; + defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", X86VPermi, avx512vl_i64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -437,9 +437,6 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; -def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, - SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4706,19 +4706,6 @@ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt - -def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - -def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int // Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=SKX --check-prefix=SKX_VBMI %s define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { ; KNL-LABEL: test1: @@ -1446,3 +1447,473 @@ %res = select i1 %t2, i8 3, i8 4 ret i8 %res } + +define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v2i64: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v2i64: +; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; 
SKX-NEXT: retq + %t2 = extractelement <2 x i64> %t1, i32 %index + ret i64 %t2 +} + +define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v4i64: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v4i64: +; SKX: ## BB#0: +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermq %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: retq + %t2 = extractelement <4 x i64> %t1, i32 %index + ret i64 %t2 +} + +define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v8i64: +; KNL: ## BB#0: +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v8i64: +; SKX: ## BB#0: +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: retq + %t2 = extractelement <8 x i64> %t1, i32 %index + ret i64 %t2 +} + +define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v2f64: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v2f64: +; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq + %t2 = extractelement <2 x double> %t1, i32 %index + ret double %t2 +} + +define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v4f64: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v4f64: +; SKX: ## BB#0: +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: retq + %t2 = extractelement <4 x double> %t1, i32 %index + ret double %t2 +} + +define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v8f64: +; KNL: ## BB#0: +; KNL-NEXT: movslq %edi, %rax +; KNL-NEXT: vmovq %rax, %xmm1 +; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v8f64: +; SKX: ## BB#0: +; SKX-NEXT: movslq %edi, %rax +; SKX-NEXT: vmovq %rax, %xmm1 +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq + %t2 = extractelement <8 x double> %t1, i32 %index + ret double %t2 +} + +define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v4i32: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovd %xmm0, %eax +; 
KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v4i32: +; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: retq + %t2 = extractelement <4 x i32> %t1, i32 %index + ret i32 %t2 +} + +define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v8i32: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v8i32: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: retq + %t2 = extractelement <8 x i32> %t1, i32 %index + ret i32 %t2 +} + +define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v16i32: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v16i32: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: retq + %t2 = extractelement <16 x i32> %t1, i32 %index + ret i32 %t2 +} + +define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v4f32: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v4f32: +; SKX: ## BB#0: +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq + %t2 = extractelement <4 x float> %t1, i32 %index + ret float %t2 +} + +define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v8f32: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v8f32: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: retq + %t2 = extractelement <8 x float> %t1, i32 %index + ret float %t2 +} + +define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v16f32: +; KNL: ## BB#0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v16f32: +; SKX: ## BB#0: +; SKX-NEXT: vmovd %edi, %xmm1 +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: retq + %t2 = extractelement <16 x float> %t1, i32 %index + ret float %t2 +} + +define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v8i16: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; KNL-NEXT: andl $7, %edi +; KNL-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v8i16: +; SKX: ## BB#0: +; SKX-NEXT: movzwl %di, %eax +; SKX-NEXT: vmovd %eax, %xmm1 +; 
SKX-NEXT: vpermw %xmm0, %xmm1, %xmm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: retq + %t2 = extractelement <8 x i16> %t1, i32 %index + ret i16 %t2 +} + +define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v16i16: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi3: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi4: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi5: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v16i16: +; SKX: ## BB#0: +; SKX-NEXT: movzwl %di, %eax +; SKX-NEXT: vmovd %eax, %xmm1 +; SKX-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: retq + %t2 = extractelement <16 x i16> %t1, i32 %index + ret i16 %t2 +} + +define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi6: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi7: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi8: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $31, %edi +; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v32i16: +; SKX: ## BB#0: +; SKX-NEXT: movzwl %di, %eax +; SKX-NEXT: vmovd %eax, %xmm1 +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: retq + %t2 = extractelement <32 x i16> %t1, i32 %index + ret i16 %t2 +} + +define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v16i8: +; KNL: ## BB#0: +; KNL-NEXT: movzbl %dil, %eax +; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpextrb $0, %xmm0, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; +; SKX_ONLY-LABEL: test_extractelement_variable_v16i8: +; SKX_ONLY: ## BB#0: +; SKX_ONLY-NEXT: movzbl %dil, %eax +; SKX_ONLY-NEXT: vmovd %eax, %xmm1 +; SKX_ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; SKX_ONLY-NEXT: vpextrb $0, %xmm0, %eax +; SKX_ONLY-NEXT: ## kill: %AL %AL %EAX +; SKX_ONLY-NEXT: retq +; +; SKX_VBMI-LABEL: test_extractelement_variable_v16i8: +; SKX_VBMI: ## BB#0: +; SKX_VBMI-NEXT: movzbl %dil, %eax +; SKX_VBMI-NEXT: vmovd %eax, %xmm1 +; SKX_VBMI-NEXT: vpermb %xmm0, %xmm1, %xmm0 +; SKX_VBMI-NEXT: vpextrb $0, %xmm0, %eax +; SKX_VBMI-NEXT: ## kill: %AL %AL %EAX +; SKX_VBMI-NEXT: retq + %t2 = extractelement <16 x i8> %t1, i32 %index + ret i8 %t2 +} + +define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v32i8: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi9: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi10: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi11: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: 
andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $31, %edi +; KNL-NEXT: movq %rsp, %rax +; KNL-NEXT: movb (%rdi,%rax), %al +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX_ONLY-LABEL: test_extractelement_variable_v32i8: +; SKX_ONLY: ## BB#0: +; SKX_ONLY-NEXT: pushq %rbp +; SKX_ONLY-NEXT: Lcfi0: +; SKX_ONLY-NEXT: .cfi_def_cfa_offset 16 +; SKX_ONLY-NEXT: Lcfi1: +; SKX_ONLY-NEXT: .cfi_offset %rbp, -16 +; SKX_ONLY-NEXT: movq %rsp, %rbp +; SKX_ONLY-NEXT: Lcfi2: +; SKX_ONLY-NEXT: .cfi_def_cfa_register %rbp +; SKX_ONLY-NEXT: andq $-32, %rsp +; SKX_ONLY-NEXT: subq $64, %rsp +; SKX_ONLY-NEXT: ## kill: %EDI %EDI %RDI +; SKX_ONLY-NEXT: vmovdqu %ymm0, (%rsp) +; SKX_ONLY-NEXT: andl $31, %edi +; SKX_ONLY-NEXT: movq %rsp, %rax +; SKX_ONLY-NEXT: movb (%rdi,%rax), %al +; SKX_ONLY-NEXT: movq %rbp, %rsp +; SKX_ONLY-NEXT: popq %rbp +; SKX_ONLY-NEXT: retq +; +; SKX_VBMI-LABEL: test_extractelement_variable_v32i8: +; SKX_VBMI: ## BB#0: +; SKX_VBMI-NEXT: movzbl %dil, %eax +; SKX_VBMI-NEXT: vmovd %eax, %xmm1 +; SKX_VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; SKX_VBMI-NEXT: vpextrb $0, %xmm0, %eax +; SKX_VBMI-NEXT: ## kill: %AL %AL %EAX +; SKX_VBMI-NEXT: retq + + %t2 = extractelement <32 x i8> %t1, i32 %index + ret i8 %t2 +} + +define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { +; KNL-LABEL: test_extractelement_variable_v64i8: +; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi12: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: Lcfi13: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: Lcfi14: +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $128, %rsp +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: andl $63, %edi +; KNL-NEXT: movq %rsp, %rax +; KNL-NEXT: movb (%rdi,%rax), %al +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX_ONLY-LABEL: test_extractelement_variable_v64i8: +; SKX_ONLY: ## BB#0: +; SKX_ONLY-NEXT: pushq %rbp +; SKX_ONLY-NEXT: Lcfi3: +; SKX_ONLY-NEXT: .cfi_def_cfa_offset 16 +; SKX_ONLY-NEXT: Lcfi4: +; SKX_ONLY-NEXT: .cfi_offset %rbp, -16 +; SKX_ONLY-NEXT: movq %rsp, %rbp +; SKX_ONLY-NEXT: Lcfi5: +; SKX_ONLY-NEXT: .cfi_def_cfa_register %rbp +; SKX_ONLY-NEXT: andq $-64, %rsp +; SKX_ONLY-NEXT: subq $128, %rsp +; SKX_ONLY-NEXT: ## kill: %EDI %EDI %RDI +; SKX_ONLY-NEXT: vmovdqu8 %zmm0, (%rsp) +; SKX_ONLY-NEXT: andl $63, %edi +; SKX_ONLY-NEXT: movq %rsp, %rax +; SKX_ONLY-NEXT: movb (%rdi,%rax), %al +; SKX_ONLY-NEXT: movq %rbp, %rsp +; SKX_ONLY-NEXT: popq %rbp +; SKX_ONLY-NEXT: retq +; +; SKX_VBMI-LABEL: test_extractelement_variable_v64i8: +; SKX_VBMI: ## BB#0: +; SKX_VBMI-NEXT: movzbl %dil, %eax +; SKX_VBMI-NEXT: vmovd %eax, %xmm1 +; SKX_VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; SKX_VBMI-NEXT: vpextrb $0, %xmm0, %eax +; SKX_VBMI-NEXT: ## kill: %AL %AL %EAX +; SKX_VBMI-NEXT: retq + + %t2 = extractelement <64 x i8> %t1, i32 %index + ret i8 %t2 +} Index: test/CodeGen/X86/extractelement-index.ll =================================================================== --- test/CodeGen/X86/extractelement-index.ll +++ test/CodeGen/X86/extractelement-index.ll @@ -402,20 +402,30 @@ ; define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { -; SSE-LABEL: extractelement_v16i8_var: -; SSE: # BB#0: -; SSE-NEXT: andl $15, %edi -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: leaq 
-{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movb (%rdi,%rax), %al -; SSE-NEXT: retq +; SSE2-LABEL: extractelement_v16i8_var: +; SSE2: # BB#0: +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movb (%rdi,%rax), %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: extractelement_v16i8_var: +; SSE41: # BB#0: +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: # kill: %AL %AL %EAX +; SSE41-NEXT: retq ; ; AVX-LABEL: extractelement_v16i8_var: ; AVX: # BB#0: -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movb (%rdi,%rax), %al +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: # kill: %AL %AL %EAX ; AVX-NEXT: retq %b = extractelement <16 x i8> %a, i256 %i ret i8 %b Index: test/CodeGen/X86/vector-shuffle-variable-128.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-variable-128.ll +++ test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -494,237 +494,302 @@ ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: # kill: %R9D %R9D %R9 -; SSSE3-NEXT: # kill: %R8D %R8D %R8 -; SSSE3-NEXT: # kill: %ECX %ECX %RCX -; SSSE3-NEXT: # kill: %EDX %EDX %RDX -; SSSE3-NEXT: # kill: %ESI %ESI %RSI -; SSSE3-NEXT: # kill: %EDI %EDI %RDI -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; SSSE3-NEXT: andl $15, %r10d -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11 -; SSSE3-NEXT: movzbl (%r10,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: subq $136, %rsp +; SSSE3-NEXT: movzbl %dil, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl %sil, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl %dl, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl %cl, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl %r8b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl %r9b, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm8 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; 
SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: andl $15, %edi -; SSSE3-NEXT: movzbl (%rdi,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: andl $15, %r8d -; SSSE3-NEXT: movzbl (%r8,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm7 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, (%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: andl $15, %esi -; SSSE3-NEXT: movzbl (%rsi,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: andl $15, %r9d -; SSSE3-NEXT: movzbl (%r9,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; 
SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: addq $136, %rsp ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSE41: # BB#0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: pushq %r15 -; SSE41-NEXT: pushq %r14 -; SSE41-NEXT: pushq %r12 -; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: # kill: %R9D %R9D %R9 -; SSE41-NEXT: # kill: %R8D %R8D %R8 -; SSE41-NEXT: # kill: %ECX %ECX %RCX -; SSE41-NEXT: # kill: %EDX %EDX %RDX -; SSE41-NEXT: # kill: %ESI %ESI %RSI -; SSE41-NEXT: # kill: %EDI %EDI %RDI -; SSE41-NEXT: andl $15, %edi -; SSE41-NEXT: andl $15, %esi -; SSE41-NEXT: andl $15, %edx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: andl $15, %r8d -; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: andl $15, %r9d -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; SSE41-NEXT: andl $15, %r10d -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE41-NEXT: andl $15, %r11d -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; SSE41-NEXT: andl $15, %r14d -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; SSE41-NEXT: andl $15, %r15d -; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movzbl (%rdi,%rax), %edi -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; SSE41-NEXT: andl $15, %r12d -; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; SSE41-NEXT: andl $15, %esi -; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; SSE41-NEXT: andl $15, %edx -; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0 +; SSE41-NEXT: movzbl %dil, %eax +; SSE41-NEXT: movd %eax, %xmm2 +; SSE41-NEXT: movzbl %sil, %eax +; SSE41-NEXT: movd %eax, %xmm3 +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: movd %eax, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: 
movzbl %r8b, %eax +; SSE41-NEXT: movd %eax, %xmm9 +; SSE41-NEXT: pextrb $0, %xmm4, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm3, %xmm7 +; SSE41-NEXT: movzbl %r9b, %ecx +; SSE41-NEXT: movd %ecx, %xmm10 ; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; SSE41-NEXT: andl $15, %ebx -; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; SSE41-NEXT: andl $15, %edi -; SSE41-NEXT: movzbl (%r10,%rax), %r8d -; SSE41-NEXT: movzbl (%r11,%rax), %r9d -; SSE41-NEXT: movzbl (%r14,%rax), %r10d -; SSE41-NEXT: movzbl (%r15,%rax), %r11d -; SSE41-NEXT: movzbl (%r12,%rax), %ebp -; SSE41-NEXT: movzbl (%rsi,%rax), %esi -; SSE41-NEXT: movzbl (%rdx,%rax), %edx -; SSE41-NEXT: movzbl (%rcx,%rax), %ecx -; SSE41-NEXT: movzbl (%rbx,%rax), %ebx -; SSE41-NEXT: movzbl (%rdi,%rax), %eax -; SSE41-NEXT: pinsrb $6, %r8d, %xmm0 -; SSE41-NEXT: pinsrb $7, %r9d, %xmm0 -; SSE41-NEXT: pinsrb $8, %r10d, %xmm0 -; SSE41-NEXT: pinsrb $9, %r11d, %xmm0 -; SSE41-NEXT: pinsrb $10, %ebp, %xmm0 -; SSE41-NEXT: pinsrb $11, %esi, %xmm0 -; SSE41-NEXT: pinsrb $12, %edx, %xmm0 -; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 -; SSE41-NEXT: pinsrb $14, %ebx, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: popq %rbx -; SSE41-NEXT: popq %r12 -; SSE41-NEXT: popq %r14 -; SSE41-NEXT: popq %r15 -; SSE41-NEXT: popq %rbp +; SSE41-NEXT: movd %ecx, %xmm11 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm12 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm13 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm14 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm15 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm5 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm6 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm2 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm3 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movd %ecx, %xmm4 +; SSE41-NEXT: pextrb $0, %xmm7, %ecx +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm1, %xmm7 +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm8, %xmm7 +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm9, %xmm7 +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm10, %xmm7 +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm11, %xmm7 +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm12, %xmm7 +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm13, %xmm7 +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm14, %xmm7 +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm15, %xmm7 +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: pextrb 
$0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm7 +; SSE41-NEXT: pshufb %xmm5, %xmm7 +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm7, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufb %xmm6, %xmm5 +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm5, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pshufb %xmm2, %xmm5 +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm5, %eax +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufb %xmm3, %xmm2 +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm2, %eax +; SSE41-NEXT: pshufb %xmm4, %xmm0 +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; AVX: # BB#0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: # kill: %R9D %R9D %R9 -; AVX-NEXT: # kill: %R8D %R8D %R8 -; AVX-NEXT: # kill: %ECX %ECX %RCX -; AVX-NEXT: # kill: %EDX %EDX %RDX -; AVX-NEXT: # kill: %ESI %ESI %RSI -; AVX-NEXT: # kill: %EDI %EDI %RDI -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; AVX-NEXT: andl $15, %r14d -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; AVX-NEXT: andl $15, %r15d -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movzbl (%rdi,%rax), %edi -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; AVX-NEXT: andl $15, %r12d -; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: movzbl %sil, %eax +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: movzbl %dl, %eax +; AVX-NEXT: vmovd %eax, %xmm8 +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vmovd %eax, %xmm9 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX-NEXT: movzbl %r8b, %eax +; AVX-NEXT: vmovd %eax, %xmm10 +; AVX-NEXT: vpextrb $0, %xmm1, %eax +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm12 +; AVX-NEXT: movzbl %r9b, %ecx +; AVX-NEXT: vmovd %ecx, %xmm11 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm13 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm14 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm15 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm2 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm3 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm4 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm5 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm6 ; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl 
{{[0-9]+}}(%rsp), %edi -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: movzbl (%r10,%rax), %r8d -; AVX-NEXT: movzbl (%r11,%rax), %r9d -; AVX-NEXT: movzbl (%r14,%rax), %r10d -; AVX-NEXT: movzbl (%r15,%rax), %r11d -; AVX-NEXT: movzbl (%r12,%rax), %ebp -; AVX-NEXT: movzbl (%rsi,%rax), %esi -; AVX-NEXT: movzbl (%rdx,%rax), %edx -; AVX-NEXT: movzbl (%rcx,%rax), %ecx -; AVX-NEXT: movzbl (%rbx,%rax), %ebx -; AVX-NEXT: movzbl (%rdi,%rax), %eax -; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, %r9d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, %ebx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vmovd %ecx, %xmm7 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpextrb $0, %xmm12, %ecx +; AVX-NEXT: vmovd %eax, %xmm12 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm12, %xmm12 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $2, %eax, %xmm12, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $3, %eax, %xmm9, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $4, %eax, %xmm9, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $5, %eax, %xmm9, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $6, %eax, %xmm9, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX-NEXT: vpinsrb $7, %eax, %xmm9, %xmm9 +; AVX-NEXT: vpextrb $0, %xmm8, %eax +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $8, %eax, %xmm9, %xmm8 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $9, %eax, %xmm8, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm2, %eax +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm1 +; AVX-NEXT: vpextrb $0, %xmm0, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <16 x i8> %x, i8 %i0 %x1 = extractelement <16 x i8> %x, i8 %i1 @@ -946,257 +1011,303 @@ ; ; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSSE3: # BB#0: +; SSSE3-NEXT: subq $136, %rsp ; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzbl 8(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm8 -; SSSE3-NEXT: 
movzbl 12(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 1(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 2(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 3(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl 4(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movzbl 14(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm10 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 5(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl 6(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: movzbl 10(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: movzbl 2(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: movzbl 15(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl 7(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 8(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 9(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, (%rsp) +; SSSE3-NEXT: movzbl 10(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl 11(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: movzbl 3(%rdi), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl 12(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: 
movdqa %xmm2, {{[0-9]+}}(%rsp)
 ; SSSE3-NEXT: movzbl 13(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm14
-; SSSE3-NEXT: movzbl 5(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movzbl 9(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT: movd %eax, %xmm15
-; SSSE3-NEXT: movzbl 1(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: movzbl (%rax,%rcx), %eax
 ; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl 14(%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movzbl 15(%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: addq $136, %rsp
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; SSE41: # BB#0:
-; SSE41-NEXT: pushq %rbp
-; SSE41-NEXT: pushq %r15
-; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
-; SSE41-NEXT: pushq %r12
-; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movzbl (%rdi), %r11d
-; SSE41-NEXT: andl $15, %r11d
-; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movzbl 1(%rdi), %r9d
-; SSE41-NEXT: andl $15, %r9d
+; SSE41-NEXT: movzbl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: movzbl 1(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm9
 ; SSE41-NEXT: movzbl 2(%rdi), %eax
-; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT: movd %eax, %xmm10
 ; SSE41-NEXT: movzbl 3(%rdi), %eax
-; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; SSE41-NEXT: movzbl 4(%rdi), %r14d
-; SSE41-NEXT: andl $15, %r14d
-; SSE41-NEXT: movzbl 5(%rdi), %r15d
-; SSE41-NEXT: andl $15, %r15d
-; SSE41-NEXT: movzbl 6(%rdi), %r12d
-; SSE41-NEXT: andl $15, %r12d
-; SSE41-NEXT: movzbl 7(%rdi), %r13d
-; SSE41-NEXT: andl $15, %r13d
-; SSE41-NEXT: movzbl 8(%rdi), %r8d
-; SSE41-NEXT: andl $15, %r8d
+; SSE41-NEXT: movd %eax, %xmm2
+; SSE41-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE41-NEXT: movzbl 4(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm11
+; SSE41-NEXT: movzbl 5(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm12
+; SSE41-NEXT: movzbl 6(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm13
+; SSE41-NEXT: movzbl 7(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm14
+; SSE41-NEXT: movzbl 8(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm15
 ; SSE41-NEXT: movzbl 9(%rdi), %eax
-; SSE41-NEXT: andl $15, %eax
-; SSE41-NEXT: movzbl 10(%rdi), %ecx
-; SSE41-NEXT: andl $15, %ecx
-; SSE41-NEXT: movzbl 11(%rdi), %edx
-; SSE41-NEXT: andl $15, %edx
-; SSE41-NEXT: movzbl 12(%rdi), %esi
-; SSE41-NEXT: andl $15, %esi
-; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movzbl (%r11,%rbp), %ebx
-; SSE41-NEXT: movd %ebx, %xmm0
-; SSE41-NEXT: movzbl 13(%rdi), %r11d
-; SSE41-NEXT: andl $15, %r11d
-; SSE41-NEXT: pinsrb $1, (%r9,%rbp), %xmm0
-; SSE41-NEXT: movzbl 14(%rdi), %ebx
-; SSE41-NEXT: andl $15, %ebx
-; SSE41-NEXT: movzbl 15(%rdi), %edi
-; SSE41-NEXT: andl $15, %edi
-; SSE41-NEXT: movzbl (%rdi,%rbp), %r10d
-; SSE41-NEXT: movzbl (%rbx,%rbp), %r9d
-; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
-; SSE41-NEXT: movzbl (%rsi,%rbp), %esi
-; SSE41-NEXT: movzbl (%rdx,%rbp), %edx
-; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
-; SSE41-NEXT: movzbl (%rax,%rbp), %eax
-; SSE41-NEXT: movzbl (%r8,%rbp), %r8d
-; SSE41-NEXT: movzbl (%r13,%rbp), %r13d
-; SSE41-NEXT: movzbl (%r12,%rbp), %r12d
-; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
-; SSE41-NEXT: movzbl (%r14,%rbp), %r14d
-; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
-; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; SSE41-NEXT: movzbl (%rbx,%rbp), %ebp
-; SSE41-NEXT: pinsrb $2, %ebp, %xmm0
-; SSE41-NEXT: pinsrb $3, %edi, %xmm0
-; SSE41-NEXT: pinsrb $4, %r14d, %xmm0
-; SSE41-NEXT: pinsrb $5, %r15d, %xmm0
-; SSE41-NEXT: pinsrb $6, %r12d, %xmm0
-; SSE41-NEXT: pinsrb $7, %r13d, %xmm0
-; SSE41-NEXT: pinsrb $8, %r8d, %xmm0
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $11, %edx, %xmm0
-; SSE41-NEXT: pinsrb $12, %esi, %xmm0
-; SSE41-NEXT: pinsrb $13, %r11d, %xmm0
-; SSE41-NEXT: pinsrb $14, %r9d, %xmm0
-; SSE41-NEXT: pinsrb $15, %r10d, %xmm0
-; SSE41-NEXT: popq %rbx
-; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
-; SSE41-NEXT: popq %r14
-; SSE41-NEXT: popq %r15
-; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: movd %eax, %xmm8
+; SSE41-NEXT: movzbl 10(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm5
+; SSE41-NEXT: movzbl 11(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm6
+; SSE41-NEXT: movzbl 12(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm7
+; SSE41-NEXT: movzbl 13(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm4
+; SSE41-NEXT: movzbl 14(%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm3
+; SSE41-NEXT: pextrb $0, %xmm1, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pshufb %xmm9, %xmm1
+; SSE41-NEXT: movzbl 15(%rdi), %ecx
+; SSE41-NEXT: movd %ecx, %xmm9
+; SSE41-NEXT: pextrb $0, %xmm1, %ecx
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm10, %xmm2
+; SSE41-NEXT: movd %eax, %xmm1
+; SSE41-NEXT: pinsrb $1, %ecx, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; SSE41-NEXT: pinsrb $2, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm11, %xmm2
+; SSE41-NEXT: pinsrb $3, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm12, %xmm2
+; SSE41-NEXT: pinsrb $4, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm13, %xmm2
+; SSE41-NEXT: pinsrb $5, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm14, %xmm2
+; SSE41-NEXT: pinsrb $6, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm15, %xmm2
+; SSE41-NEXT: pinsrb $7, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm8, %xmm2
+; SSE41-NEXT: pinsrb $8, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm5, %xmm2
+; SSE41-NEXT: pinsrb $9, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm6, %xmm2
+; SSE41-NEXT: pinsrb $10, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm7, %xmm2
+; SSE41-NEXT: pinsrb $11, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm4, %xmm2
+; SSE41-NEXT: pinsrb $12, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pshufb %xmm3, %xmm2
+; SSE41-NEXT: pinsrb $13, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm2, %eax
+; SSE41-NEXT: pshufb %xmm9, %xmm0
+; SSE41-NEXT: pinsrb $14, %eax, %xmm1
+; SSE41-NEXT: pextrb $0, %xmm0, %eax
+; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; AVX: # BB#0:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r15
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %r13
-; AVX-NEXT: pushq %r12
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movzbl (%rdi), %r11d
-; AVX-NEXT: andl $15, %r11d
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movzbl 1(%rdi), %r9d
-; AVX-NEXT: andl $15, %r9d
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm9
+; AVX-NEXT: movzbl 1(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm10
 ; AVX-NEXT: movzbl 2(%rdi), %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT: vmovd %eax, %xmm8
 ; AVX-NEXT: movzbl 3(%rdi), %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX-NEXT: movzbl 4(%rdi), %r14d
-; AVX-NEXT: andl $15, %r14d
-; AVX-NEXT: movzbl 5(%rdi), %r15d
-; AVX-NEXT: andl $15, %r15d
-; AVX-NEXT: movzbl 6(%rdi), %r12d
-; AVX-NEXT: andl $15, %r12d
-; AVX-NEXT: movzbl 7(%rdi), %r13d
-; AVX-NEXT: andl $15, %r13d
-; AVX-NEXT: movzbl 8(%rdi), %r8d
-; AVX-NEXT: andl $15, %r8d
+; AVX-NEXT: vmovd %eax, %xmm11
+; AVX-NEXT: movzbl 4(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm12
+; AVX-NEXT: movzbl 5(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm13
+; AVX-NEXT: movzbl 6(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm14
+; AVX-NEXT: movzbl 7(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm15
+; AVX-NEXT: movzbl 8(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm2
 ; AVX-NEXT: movzbl 9(%rdi), %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: movzbl 10(%rdi), %ecx
-; AVX-NEXT: andl $15, %ecx
-; AVX-NEXT: movzbl 11(%rdi), %edx
-; AVX-NEXT: andl $15, %edx
-; AVX-NEXT: movzbl 12(%rdi), %esi
-; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movzbl (%r11,%rbp), %ebx
-; AVX-NEXT: vmovd %ebx, %xmm0
-; AVX-NEXT: movzbl 13(%rdi), %r11d
-; AVX-NEXT: andl $15, %r11d
-; AVX-NEXT: vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0
-; AVX-NEXT: movzbl 14(%rdi), %ebx
-; AVX-NEXT: andl $15, %ebx
-; AVX-NEXT: movzbl 15(%rdi), %edi
-; AVX-NEXT: andl $15, %edi
-; AVX-NEXT: movzbl (%rdi,%rbp), %r10d
-; AVX-NEXT: movzbl (%rbx,%rbp), %r9d
-; AVX-NEXT: movzbl (%r11,%rbp), %r11d
-; AVX-NEXT: movzbl (%rsi,%rbp), %esi
-; AVX-NEXT: movzbl (%rdx,%rbp), %edx
-; AVX-NEXT: movzbl (%rcx,%rbp), %ecx
-; AVX-NEXT: movzbl (%rax,%rbp), %eax
-; AVX-NEXT: movzbl (%r8,%rbp), %r8d
-; AVX-NEXT: movzbl (%r13,%rbp), %r13d
-; AVX-NEXT: movzbl (%r12,%rbp), %r12d
-; AVX-NEXT: movzbl (%r15,%rbp), %r15d
-; AVX-NEXT: movzbl (%r14,%rbp), %r14d
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; AVX-NEXT: movzbl (%rdi,%rbp), %edi
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; AVX-NEXT: movzbl (%rbx,%rbp), %ebp
-; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %r10d, %xmm0, %xmm0
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r12
-; AVX-NEXT: popq %r13
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %r15
-; AVX-NEXT: popq %rbp
+; AVX-NEXT: vmovd %eax, %xmm3
+; AVX-NEXT: movzbl 10(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm4
+; AVX-NEXT: movzbl 11(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm5
+; AVX-NEXT: movzbl 12(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm6
+; AVX-NEXT: movzbl 13(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm7
+; AVX-NEXT: movzbl 14(%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpextrb $0, %xmm9, %eax
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm9
+; AVX-NEXT: movzbl 15(%rdi), %ecx
+; AVX-NEXT: vmovd %ecx, %xmm10
+; AVX-NEXT: vpextrb $0, %xmm9, %ecx
+; AVX-NEXT: vmovd %eax, %xmm9
+; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $1, %ecx, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $2, %eax, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $3, %eax, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $4, %eax, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $5, %eax, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8
+; AVX-NEXT: vpinsrb $6, %eax, %xmm9, %xmm9
+; AVX-NEXT: vpextrb $0, %xmm8, %eax
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $7, %eax, %xmm9, %xmm8
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $8, %eax, %xmm8, %xmm3
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm2
+; AVX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrb $0, %xmm2, %eax
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrb $0, %xmm1, %eax
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm1
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %p0 = getelementptr inbounds i8, i8* %i, i64 0
 %p1 = getelementptr inbounds i8, i8* %i, i64 1