Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18704,6 +18704,53 @@
     SDValue UndefV = LHS.getOperand(1);
     return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
   }
+
+  SDLoc DL(N);
+  SDValue UndefV = DAG.getUNDEF(VT);
+  auto isMaskSafeForReorderedShuffle = [&](bool IsShufOp0) {
+    // If the shuffle mask has no undefined elements, the mask is safe.
+    ShuffleVectorSDNode *Shuf = IsShufOp0 ? Shuf0 : Shuf1;
+    if (all_of(Shuf->getMask(), [](int M) { return M != -1; }))
+      return true;
+
+    // If the binop would create undefined elements for the lanes where the
+    // shuffle does too, the mask is safe. If the mask has undefined elements,
+    // but the binop would not produce undefined elements, it is not safe (the
+    // new shuffle could produce undefined elements where the original code
+    // did not).
+    return IsShufOp0 ? DAG.getNode(Opcode, DL, VT, UndefV, RHS).isUndef()
+                     : DAG.getNode(Opcode, DL, VT, LHS, UndefV).isUndef();
+  };
+
+  // Move a splat after a binop to increase shuffle combining and demanded
+  // elements opportunities.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (Shuf0 && Shuf0->isSplat() && Shuf0->hasOneUse() &&
+      (isConstOrConstSplat(RHS) || isConstOrConstSplatFP(RHS))) {
+    // bo (splat X), SplatC --> splat (bo X, SplatC)
+    SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0), RHS,
+                                   N->getFlags());
+    // Re-use the existing shuffle mask if possible to preserve undef lanes.
+    if (isMaskSafeForReorderedShuffle(true))
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
+
+    // We can't preserve undef lanes, so create a "full" splat.
+    SmallVector<int, 16> SplatMask(NumElts, Shuf0->getSplatIndex());
+    return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, SplatMask);
+  }
+  if (Shuf1 && Shuf1->isSplat() && Shuf1->hasOneUse() &&
+      (isConstOrConstSplat(LHS) || isConstOrConstSplatFP(LHS))) {
+    // bo SplatC, (splat X) --> splat (bo SplatC, X)
+    SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, RHS.getOperand(0),
+                                   N->getFlags());
+    // Re-use the existing shuffle mask if possible to preserve undef lanes.
+    if (isMaskSafeForReorderedShuffle(false))
+      return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf1->getMask());
+
+    // We can't preserve undef lanes, so create a "full" splat.
+    SmallVector<int, 16> SplatMask(NumElts, Shuf1->getSplatIndex());
+    return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, SplatMask);
+  }
 }
 
 // The following pattern is likely to emerge with vector reduction ops.
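For illustration only (not part of the patch): the combine above runs on the SelectionDAG, but the same reordering expressed in IR is a minimal sketch of bo (splat X), SplatC --> splat (bo X, SplatC). The function name, %x, and the constant 42.0 are placeholders for this example; the scalarize-fp.ll diffs below show the same effect on real tests.

define <2 x double> @splat_binop_sketch(<2 x double> %x) {
  ; Before the combine: splat the operand, then add a constant splat.
  ;   %splat = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> zeroinitializer
  ;   %r = fadd <2 x double> %splat, <double 42.0, double 42.0>
  ;
  ; After the combine (conceptually): add first, then splat the result.
  ; Only lane 0 of %b is demanded by the splat mask, so later combines can
  ; shrink the vector fadd to a scalar add (addsd), as the scalarize-fp.ll
  ; changes demonstrate.
  %b = fadd <2 x double> %x, <double 42.0, double 42.0>
  %r = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %r
}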
Moving Index: llvm/test/CodeGen/ARM/reg_sequence.ll =================================================================== --- llvm/test/CodeGen/ARM/reg_sequence.ll +++ llvm/test/CodeGen/ARM/reg_sequence.ll @@ -273,7 +273,7 @@ entry: ; CHECK-LABEL: t10: ; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000 -; CHECK: vmul.f32 q8, q9, d1[0] +; CHECK: vmul.f32 q8, q8, q[[Q0]] ; CHECK: vadd.f32 q8, q8, q8 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1] %1 = insertelement <4 x float> %0, float %x, i32 1 ; <<4 x float>> [#uses=1] Index: llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -9653,7 +9653,8 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x0f] +; X64-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 @@ -9676,8 +9677,9 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vpbroadcastd (%rdi), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x17] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f] +; X64-NEXT: vpmuldq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x28,0xca] ; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b @@ -9700,8 +9702,9 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vpbroadcastd (%rdi), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x0f] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07] +; X64-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 Index: llvm/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -236,9 +236,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -278,9 +276,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -841,9 +837,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa 
%xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -904,9 +898,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1684,9 +1676,7 @@ ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1757,9 +1747,7 @@ ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 Index: llvm/test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -238,9 +238,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -274,9 +272,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -781,9 +777,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -835,9 +829,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1588,9 +1580,7 @@ ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1652,9 +1642,7 @@ ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor 
%xmm4, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 Index: llvm/test/CodeGen/X86/scalarize-fp.ll =================================================================== --- llvm/test/CodeGen/X86/scalarize-fp.ll +++ llvm/test/CodeGen/X86/scalarize-fp.ll @@ -539,14 +539,14 @@ define <2 x double> @fadd_splat_const_op1_v2f64(<2 x double> %vx) { ; SSE-LABEL: fadd_splat_const_op1_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] -; SSE-NEXT: addpd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fadd_splat_const_op1_v2f64: ; AVX: # %bb.0: +; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> zeroinitializer %r = fadd <2 x double> %splatx, @@ -580,14 +580,14 @@ define <4 x float> @fmul_splat_const_op1_v4f32(<4 x float> %vx, <4 x float> %vy) { ; SSE-LABEL: fmul_splat_const_op1_v4f32: ; SSE: # %bb.0: +; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fmul_splat_const_op1_v4f32: ; AVX: # %bb.0: +; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %r = fmul fast <4 x float> %splatx, @@ -597,28 +597,18 @@ define <8 x float> @fdiv_splat_const_op0_v8f32(<8 x float> %vy) { ; SSE-LABEL: fdiv_splat_const_op0_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fdiv_splat_const_op0_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer %splaty = shufflevector <8 x float> %vy, <8 x float> undef, <8 x i32> zeroinitializer @@ -629,22 +619,18 @@ define <8 x float> @fdiv_const_op1_splat_v8f32(<8 x float> %vx) { ; SSE-LABEL: fdiv_const_op1_splat_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: rcpps %xmm1, %xmm1 -; SSE-NEXT: addps %xmm1, %xmm1 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: 
fdiv_const_op1_splat_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vrcpps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer %splaty = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1210,10 +1210,10 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psllq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE2-NEXT: psubq %xmm2, %xmm4 ; SSE2-NEXT: psrlq %xmm4, %xmm1 @@ -1230,25 +1230,25 @@ ; SSE41-LABEL: splatvar_funnnel_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq %xmm4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psllq %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64] -; SSE41-NEXT: psubq %xmm4, %xmm0 +; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: psrlq %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1260,9 +1260,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1275,15 +1275,14 @@ ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq 
%xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1291,15 +1290,14 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq @@ -1307,15 +1305,14 @@ ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1324,15 +1321,14 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1340,15 +1336,14 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VLBW-NEXT: 
vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1361,9 +1356,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1375,9 +1370,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1389,29 +1384,27 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psllq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllq %xmm4, %xmm5 -; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] -; X32-SSE-NEXT: psubq %xmm2, %xmm3 -; X32-SSE-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE-NEXT: psrlq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; X32-SSE-NEXT: orpd %xmm5, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: movl $63, %eax +; X32-SSE-NEXT: movd %eax, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] +; X32-SSE-NEXT: pand %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,0,64,0] +; X32-SSE-NEXT: psubq %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psrlq %xmm4, %xmm2 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; X32-SSE-NEXT: psrlq %xmm4, %xmm1 +; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psllq %xmm3, %xmm2 +; X32-SSE-NEXT: por %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm0 +; X32-SSE-NEXT: pandn %xmm2, %xmm5 +; X32-SSE-NEXT: por %xmm5, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) @@ -1421,8 +1414,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: 
splatvar_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: xorps %xmm4, %xmm4 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1434,6 +1426,7 @@ ; SSE2-NEXT: movd %ecx, %xmm4 ; SSE2-NEXT: psrld %xmm4, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -1443,28 +1436,28 @@ ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pslld %xmm0, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pslld %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] -; SSE41-NEXT: psubd %xmm4, %xmm0 +; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: psrld %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1477,11 +1470,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1495,17 +1488,17 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; 
AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1513,17 +1506,16 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq @@ -1531,17 +1523,17 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1550,17 +1542,17 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1568,17 +1560,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1591,10 +1582,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1607,11 +1598,11 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1624,8 +1615,7 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 ; X32-SSE-NEXT: xorps %xmm4, %xmm4 ; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1637,6 +1627,7 @@ ; X32-SSE-NEXT: movd %ecx, %xmm4 ; X32-SSE-NEXT: psrld %xmm4, %xmm1 ; X32-SSE-NEXT: por %xmm5, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X32-SSE-NEXT: pand %xmm2, %xmm0 ; X32-SSE-NEXT: pandn %xmm1, %xmm2 @@ -1650,53 +1641,53 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psllw %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrlw %xmm3, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpeqw %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psllw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16] -; SSE41-NEXT: psubw %xmm4, %xmm0 +; SSE41-NEXT: psubw %xmm2, %xmm0 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: psrlw %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1709,10 +1700,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1725,10 +1716,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1741,10 +1732,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1758,17 +1749,16 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %xmm0, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1777,17 +1767,16 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm5, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; 
AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1795,17 +1784,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1818,11 +1806,11 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1835,10 +1823,10 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1851,24 +1839,24 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm4 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllw %xmm2, %xmm5 +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psllw %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psrlw %xmm3, %xmm1 -; X32-SSE-NEXT: por %xmm5, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm4 -; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm3 +; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat) @@ -1878,62 +1866,63 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psllw %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: psllw %xmm2, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm3, %xmm1 -; SSE2-NEXT: psrlw %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: psllw %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand 
%xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm4 +; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm4, %xmm1 +; SSE2-NEXT: psrlw %xmm4, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psllw %xmm5, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE41-NEXT: psllw %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: psubb %xmm2, %xmm5 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm5, %xmm1 -; SSE41-NEXT: psrlw %xmm5, %xmm6 -; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm1, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm6 +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: psubb %xmm2, %xmm6 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm6, %xmm1 +; SSE41-NEXT: psrlw %xmm6, %xmm5 +; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm1, %xmm5 +; SSE41-NEXT: por %xmm5, %xmm4 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -1941,30 +1930,29 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsllw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: 
vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm4 @@ -1972,6 +1960,7 @@ ; AVX2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3 ; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1988,9 +1977,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2008,9 +1997,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2029,20 +2018,19 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; 
AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -2051,20 +2039,19 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2072,20 +2059,19 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: vzeroupper @@ -2093,20 +2079,19 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: vzeroupper @@ -2114,9 +2099,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4 ; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 @@ -2127,8 +2112,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 @@ -2140,38 +2125,39 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X32-SSE-NEXT: psubb %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm4 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllw %xmm2, %xmm5 -; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 -; X32-SSE-NEXT: psllw %xmm2, %xmm6 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; X32-SSE-NEXT: pand %xmm5, %xmm6 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm3, %xmm1 -; X32-SSE-NEXT: psrlw %xmm3, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psllw %xmm3, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm5, 
%xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 +; X32-SSE-NEXT: psllw %xmm3, %xmm6 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: pand %xmm4, %xmm3 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: pandn %xmm2, %xmm4 -; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X32-SSE-NEXT: psubb %xmm2, %xmm4 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm4, %xmm1 +; X32-SSE-NEXT: psrlw %xmm4, %xmm5 +; X32-SSE-NEXT: psrlw $8, %xmm5 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm4 +; X32-SSE-NEXT: por %xmm3, %xmm4 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pandn %xmm4, %xmm1 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshl-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-256.ll +++ llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -927,35 +927,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -968,30 +964,30 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -999,15 +995,15 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1015,30 +1011,30 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; 
AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1051,35 +1047,31 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpsrlq %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllq 
%xmm2, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1096,39 +1088,34 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1142,34 +1129,34 @@ ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: 
vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -1177,17 +1164,17 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1195,34 +1182,34 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: 
vpbroadcastd %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1235,39 +1222,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; XOPAVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; XOPAVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: 
vpcomeqd %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1285,30 +1267,25 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1316,10 +1293,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw 
%xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1332,10 +1309,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1348,10 +1325,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1365,17 +1342,16 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1383,34 +1359,32 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: 
vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1423,39 +1397,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; XOPAVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; 
XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1473,41 +1442,33 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpsllw %xmm4, %xmm9, %xmm7 -; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm5, %xmm10, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpsrlw %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsubb %xmm2, %xmm10, %xmm6 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-NEXT: vpcmpeqb %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vpcmpeqb 
%xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1515,14 +1476,14 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1539,14 +1500,14 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1563,14 +1524,14 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1588,20 +1549,19 @@ ; AVX512BW-LABEL: splatvar_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: 
vporq %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1609,94 +1569,86 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; XOPAVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm5 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vorps %ymm1, %ymm4, %ymm1 +; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2 ; 
XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm6 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm4, %xmm7 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vorps %ymm1, %ymm5, %ymm1 -; XOPAVX1-NEXT: vpcomeqb %xmm8, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpcomeqb %xmm8, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm4 Index: llvm/test/CodeGen/X86/vector-fshl-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -566,45 +566,45 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: 
splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -617,15 +617,15 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -643,51 +643,51 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; 
AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -700,17 +700,17 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -728,10 +728,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand 
{{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -748,10 +748,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -768,17 +768,16 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -791,17 +790,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 
{%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -819,14 +817,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 ; AVX512F-NEXT: vpsllw %xmm5, %ymm9, %ymm8 ; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -849,14 +847,14 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm9, %ymm8 ; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -879,100 +877,96 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512BW-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512BW-NEXT: 
vpsrlw $8, %zmm4, %zmm4 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLBW-NEXT: 
vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq Index: llvm/test/CodeGen/X86/vector-fshl-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -797,8 +797,8 @@ ; ; SSE41-LABEL: 
splatvar_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pslld %xmm2, %xmm3 @@ -811,8 +811,8 @@ ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -824,9 +824,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -899,9 +899,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -916,9 +916,9 @@ ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw %xmm2, %xmm3 @@ -931,11 +931,11 @@ ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -945,8 +945,8 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -958,8 +958,8 @@ ; ; AVX512-LABEL: splatvar_funnnel_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ 
-984,9 +984,9 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -1006,9 +1006,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 @@ -1039,9 +1036,9 @@ ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pshufb %xmm3, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psllw %xmm4, %xmm2 @@ -1063,9 +1060,9 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1084,8 +1081,8 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1191,9 +1188,6 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 Index: llvm/test/CodeGen/X86/vector-fshl-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -316,8 +316,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -332,8 +332,8 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, 
%ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -383,8 +383,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -409,8 +409,8 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 Index: llvm/test/CodeGen/X86/vector-fshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1227,10 +1227,10 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrlq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE2-NEXT: psubq %xmm2, %xmm4 ; SSE2-NEXT: psllq %xmm4, %xmm0 @@ -1248,10 +1248,10 @@ ; SSE41-LABEL: splatvar_funnnel_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrlq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE41-NEXT: psubq %xmm2, %xmm4 ; SSE41-NEXT: psllq %xmm4, %xmm3 @@ -1264,9 +1264,9 @@ ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1278,9 +1278,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1293,15 +1293,14 @@ ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsrlq %xmm4, 
%xmm1, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1309,30 +1308,28 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1341,15 +1338,14 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1357,15 +1353,14 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlq 
%xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1378,9 +1373,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1392,9 +1387,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1406,28 +1401,26 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlq %xmm4, %xmm5 -; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] -; X32-SSE-NEXT: psubq %xmm2, %xmm3 +; X32-SSE-NEXT: movl $63, %eax +; X32-SSE-NEXT: movd %eax, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,0,3,2] +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [64,0,64,0] +; X32-SSE-NEXT: psubq %xmm4, %xmm5 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm3, %xmm0 +; X32-SSE-NEXT: psllq %xmm5, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; X32-SSE-NEXT: psllq %xmm5, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; X32-SSE-NEXT: orpd %xmm5, %xmm0 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] -; X32-SSE-NEXT: pand %xmm3, %xmm2 +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlq %xmm3, %xmm4 +; X32-SSE-NEXT: por %xmm0, %xmm4 ; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: pandn %xmm4, %xmm2 ; X32-SSE-NEXT: por %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl @@ -1439,8 +1432,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm2 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: xorps %xmm4, %xmm4 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1452,6 +1444,7 @@ ; SSE2-NEXT: movd %ecx, %xmm4 ; SSE2-NEXT: pslld %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 @@ -1462,11 +1455,11 @@ ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrld %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] ; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -1480,10 +1473,10 @@ ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1496,11 +1489,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1514,17 +1507,17 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1532,34 +1525,33 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastd 
{{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1568,17 +1560,17 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1586,17 +1578,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; 
AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1609,10 +1600,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1625,11 +1616,11 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1642,8 +1633,7 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 ; X32-SSE-NEXT: xorps %xmm4, %xmm4 ; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1655,6 +1645,7 @@ ; X32-SSE-NEXT: movd %ecx, %xmm4 ; X32-SSE-NEXT: pslld %xmm4, %xmm0 ; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: pandn %xmm0, %xmm2 @@ -1669,21 +1660,21 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm3, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlw %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm3, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm3, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw %xmm3, %xmm5 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 @@ -1693,12 +1684,12 @@ ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrlw %xmm0, %xmm4 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16] ; SSE41-NEXT: psubw %xmm2, %xmm0 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1712,11 +1703,11 @@ ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1729,10 +1720,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1745,10 +1736,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1761,10 +1752,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1778,17 +1769,16 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1797,17 +1787,16 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1815,17 +1804,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 ; 
AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1838,11 +1826,11 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1855,10 +1843,10 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1871,21 +1859,21 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm3, %xmm4 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlw %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm3, %xmm2 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlw %xmm3, %xmm5 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: pandn %xmm0, %xmm2 ; X32-SSE-NEXT: por %xmm1, %xmm2 @@ -1899,63 +1887,63 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlw %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE2-NEXT: psrlw %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm4 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psllw %xmm4, %xmm0 -; SSE2-NEXT: psllw %xmm4, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psllw %xmm4, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb 
%xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: psrlw %xmm4, %xmm5 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE41-NEXT: psrlw %xmm4, %xmm7 -; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: psrlw %xmm0, %xmm6 +; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm2, %xmm4 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psllw %xmm4, %xmm3 -; SSE41-NEXT: psllw %xmm4, %xmm6 -; SSE41-NEXT: pshufb %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: psllw %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm0, %xmm5 +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: por %xmm6, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -1963,30 +1951,29 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb 
%xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4 @@ -1995,6 +1982,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3 ; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -2010,9 +1998,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2030,9 +2018,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2051,20 +2039,19 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; 
AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2073,20 +2060,19 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2094,49 +2080,47 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # 
%bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2149,8 +2133,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4 @@ -2164,39 +2148,39 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X32-SSE-NEXT: psubb %xmm3, %xmm4 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlw %xmm3, %xmm5 +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlw %xmm3, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 ; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 ; X32-SSE-NEXT: psrlw %xmm3, %xmm6 -; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3 ; X32-SSE-NEXT: psrlw $8, %xmm6 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; X32-SSE-NEXT: pand %xmm5, %xmm6 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: pand %xmm4, %xmm3 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X32-SSE-NEXT: psubb %xmm2, %xmm4 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psllw %xmm4, %xmm0 -; X32-SSE-NEXT: psllw %xmm4, %xmm3 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; X32-SSE-NEXT: pand %xmm0, %xmm3 -; X32-SSE-NEXT: por %xmm6, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm3, %xmm2 -; X32-SSE-NEXT: por %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE-NEXT: psllw %xmm4, %xmm5 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm4 +; 
X32-SSE-NEXT: por %xmm3, %xmm4 +; X32-SSE-NEXT: pxor %xmm0, %xmm0 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pand %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshr-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -931,35 +931,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsllq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -972,45 +968,45 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; 
AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1018,30 +1014,30 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, 
%xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1054,35 +1050,31 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpsllq %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -1099,39 +1091,34 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsrld %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpslld %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-NEXT: vpslld %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrld 
%xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1145,51 +1132,51 @@ ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor 
%ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1197,34 +1184,34 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; 
AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1237,39 +1224,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsrld %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpslld %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; XOPAVX1-NEXT: vpslld %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1287,30 +1269,25 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1318,10 +1295,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1334,10 +1311,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1350,10 +1327,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1367,17 +1344,16 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 
def $zmm1 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1385,34 +1361,32 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, 
%ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1425,39 +1399,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; XOPAVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1475,40 +1444,33 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = 
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm5, %xmm7, %xmm6 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsllw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsubb %xmm2, %xmm7, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm6 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7 ; AVX1-NEXT: vpsllw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm9, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1516,8 +1478,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1525,6 +1486,7 @@ ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1540,8 +1502,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; 
AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1549,6 +1510,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1564,8 +1526,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1573,6 +1534,7 @@ ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1589,20 +1551,19 @@ ; AVX512BW-LABEL: splatvar_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa 
{{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1610,93 +1571,85 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; 
AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: 
vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm5 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm6 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm4, %xmm8, %xmm7 +; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm6, %xmm6 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm8, %xmm7 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm5, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm4, %xmm4 +; XOPAVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 ; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3 Index: llvm/test/CodeGen/X86/vector-fshr-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -558,43 +558,43 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; 
AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -607,15 +607,15 @@
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -633,49 +633,49 @@
define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = 
[31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpsrld %xmm5, %zmm1, %zmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpsrld %xmm5, %zmm1, %zmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -688,17 +688,17 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; 
AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm5, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -716,10 +716,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -736,10 +736,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -756,17 +756,16 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -779,17 +778,16 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; 
AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -807,8 +805,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 ; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 @@ -816,6 +813,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm8, %ymm8 ; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -837,8 +835,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 @@ -846,6 +843,7 @@ ; AVX512VL-NEXT: vpsrlw $8, %ymm8, %ymm8 ; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -867,97 +865,93 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512BW-NEXT: 
vpbroadcastb %xmm3, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6 -; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512BW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5 -; AVX512BW-NEXT: vpsrlw $8, %zmm5, %zmm5 -; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw %xmm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vpsllw %xmm4, %zmm5, %zmm4 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6 -; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4 +; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; 
AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6 -; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm7, %zmm4 +; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm5, %zmm4 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 +; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6 -; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, 
%zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4 +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer Index: llvm/test/CodeGen/X86/vector-fshr-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -785,17 +785,17 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -843,26 +843,26 @@ ; ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: psubd %xmm1, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pslld %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32] -; SSE41-NEXT: psubd %xmm2, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero ; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -874,11 +874,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -920,17 +920,17 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -956,49 +956,49 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllw %xmm2, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: psubw %xmm1, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; SSE41-NEXT: psubw %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; SSE41-NEXT: psubw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; 
AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -1008,10 +1008,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -1023,10 +1023,10 @@ ; ; AVX512-LABEL: splatvar_funnnel_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -1038,37 +1038,37 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm2, %xmm1 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psllw %xmm2, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psllw %xmm1, %xmm3 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm2, %xmm0 ; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer @@ -1079,9 +1079,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 @@ -1114,10 +1111,10 @@ ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: pshufb %xmm2, %xmm3 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllw %xmm4, %xmm1 @@ -1140,9 +1137,9 @@ ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1161,10 +1158,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1258,24 +1255,21 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: psubb %xmm1, %xmm2 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 Index: 
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -628,9 +628,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 @@ -639,9 +639,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 @@ -656,9 +656,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -676,9 +676,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero @@ -720,9 +720,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 @@ -731,9 +731,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 @@ -748,10 +748,10 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -769,9 +769,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; 
AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -784,9 +784,9 @@ ; ; AVX512-LABEL: splatvar_funnnel_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -799,10 +799,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -811,9 +811,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 @@ -829,8 +829,8 @@ ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 @@ -857,9 +857,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -880,9 +880,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -903,9 +903,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ 
-961,8 +961,8 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 @@ -971,9 +971,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 Index: llvm/test/CodeGen/X86/vector-fshr-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -328,10 +328,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -346,10 +346,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -399,10 +399,10 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -427,10 +427,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 Index: llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll @@ -1160,11 +1160,9 @@ define i16 @test_v2i16(<2 x 
i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1208,9 +1206,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1265,9 +1261,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1335,9 +1329,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1430,9 +1422,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1543,9 +1533,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 Index: llvm/test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -727,25 +727,23 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: retq ; @@ -753,18 +751,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: retq @@ -772,29 +770,29 @@ ; AVX1-LABEL: test_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -802,10 +800,9 @@ ; AVX512VL-LABEL: test_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> %a0) @@ -1209,15 +1206,12 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: 
pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1226,9 +1220,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq @@ -1237,18 +1231,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax @@ -1257,11 +1251,10 @@ ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1269,10 +1262,9 @@ ; AVX512BW-LABEL: test_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $ax killed $ax 
killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1281,10 +1273,9 @@ ; AVX512VL-LABEL: test_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq @@ -1304,17 +1295,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1325,10 +1313,9 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] ; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE41-NEXT: pmaxud %xmm2, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq @@ -1340,10 +1327,9 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1355,10 +1341,9 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1380,9 +1365,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1450,9 +1433,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1545,9 +1526,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1658,9 +1637,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1762,16 +1739,12 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1780,9 +1753,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -1790,19 +1763,19 @@ ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: por %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pextrb $0, %xmm2, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1810,21 +1783,19 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1832,10 +1803,9 @@ ; ; AVX512VL-LABEL: test_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq @@ -1855,18 +1825,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; 
SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1877,11 +1843,10 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pmaxud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1891,10 +1856,9 @@ ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1905,10 +1869,9 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1919,10 +1882,9 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1942,14 +1904,11 @@ ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1964,11 +1923,11 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pmaxuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1981,10 +1940,10 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1998,10 +1957,10 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -2015,10 +1974,10 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 
= xmm0[1,1,2,3,4,5,6,7] +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll @@ -1159,11 +1159,9 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1207,9 +1205,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1264,9 +1260,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1315,9 +1309,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1387,9 +1379,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1475,9 +1465,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 Index: llvm/test/CodeGen/X86/vector-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -726,14 +726,12 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -742,9 +740,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: 
pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: retq ; @@ -752,18 +750,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: retq @@ -771,29 +769,29 @@ ; AVX1-LABEL: test_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpminuq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -801,10 +799,9 @@ ; AVX512VL-LABEL: test_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> %a0) @@ -1208,15 +1205,12 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1225,9 +1219,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq @@ -1236,18 +1230,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax @@ -1256,11 +1250,10 @@ ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq 
%xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1268,10 +1261,9 @@ ; AVX512BW-LABEL: test_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1280,10 +1272,9 @@ ; AVX512VL-LABEL: test_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq @@ -1303,17 +1294,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1324,10 +1312,9 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] ; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq @@ -1339,10 +1326,9 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1354,10 +1340,9 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1379,9 +1364,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1430,9 +1413,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1502,9 +1483,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1590,9 +1569,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1668,16 +1645,12 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa 
%xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1686,9 +1659,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -1696,19 +1669,19 @@ ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pextrb $0, %xmm2, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1716,21 +1689,19 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1738,10 +1709,9 @@ ; ; AVX512VL-LABEL: test_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; 
AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq @@ -1761,18 +1731,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1783,11 +1749,10 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1797,10 +1762,9 @@ ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1811,10 +1775,9 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1825,10 +1788,9 @@ ; 
AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1848,14 +1810,11 @@ ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1870,11 +1829,11 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pminuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1887,10 +1846,10 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1904,10 +1863,10 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1921,10 +1880,10 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rotate-128.ll +++ llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -752,8 +752,8 @@ ; ; SSE41-LABEL: splatvar_rotate_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pslld %xmm2, %xmm3 @@ -766,8 +766,8 @@ ; ; AVX1-LABEL: splatvar_rotate_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -779,9 +779,9 @@ ; ; AVX2-LABEL: splatvar_rotate_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -857,9 +857,9 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_rotate_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -874,9 +874,9 @@ ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw %xmm2, %xmm3 @@ -889,11 
+889,11 @@ ; ; AVX1-LABEL: splatvar_rotate_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -903,8 +903,8 @@ ; ; AVX2-LABEL: splatvar_rotate_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -916,8 +916,8 @@ ; ; AVX512-LABEL: splatvar_rotate_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -942,9 +942,9 @@ ; ; X32-SSE-LABEL: splatvar_rotate_v8i16: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -967,9 +967,6 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_rotate_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 @@ -1000,9 +997,9 @@ ; ; SSE41-LABEL: splatvar_rotate_v16i8: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pshufb %xmm3, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psllw %xmm4, %xmm2 @@ -1024,9 +1021,9 @@ ; ; AVX1-LABEL: splatvar_rotate_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1045,8 +1042,8 @@ ; ; AVX2-LABEL: splatvar_rotate_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1140,9 +1137,6 @@ ; ; X32-SSE-LABEL: splatvar_rotate_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 Index: llvm/test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -645,17 +645,17 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -848,16 +848,16 @@ ; XOPAVX1-LABEL: splatvar_shift_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ -1368,18 +1368,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v8i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v8i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1510,18 +1510,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1652,9 +1652,9 @@ ; ; XOP-LABEL: splatvar_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -930,18 +930,19 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrlq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: psrlq %xmm3, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlq %xmm0, %xmm4 +; SSE2-NEXT: psrlq %xmm3, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE2-NEXT: xorpd %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 @@ -996,7 +997,7 @@ ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 @@ -1021,7 +1022,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 @@ -1032,7 +1033,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 @@ -1044,7 +1045,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: 
vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 @@ -1083,8 +1084,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad %xmm1, %xmm3 @@ -1107,44 +1109,42 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pslld $16, %xmm0 ; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrad %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 ; AVX1-NEXT: 
vpsrad %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: retq ; @@ -1152,9 +1152,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1162,9 +1161,9 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -1173,9 +1172,8 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1183,9 +1181,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1193,9 +1190,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1203,8 +1200,9 @@ ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pslld $16, 
%xmm0 ; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psrad %xmm1, %xmm3 @@ -1307,7 +1305,7 @@ ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 @@ -1333,7 +1331,7 @@ ; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 @@ -1343,7 +1341,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 @@ -1355,7 +1353,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-shift-lshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -699,16 +699,16 @@ ; XOPAVX1-LABEL: splatvar_shift_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll @@ -1120,18 +1120,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v8i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v8i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1242,18 +1242,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1364,9 +1364,9 @@ ; ; XOP-LABEL: splatvar_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -775,10 +775,10 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrlq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -815,7 +815,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -834,7 +834,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -843,7 +843,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpsrlvq %xmm1, 
%xmm0, %xmm0 ; AVX512-NEXT: retq @@ -852,7 +852,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -877,18 +877,18 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm1, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld %xmm1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -902,52 +902,52 @@ ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld %xmm0, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm0, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -955,8 +955,8 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -965,8 +965,8 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -974,8 +974,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -983,25 +983,25 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm1, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrld %xmm1, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrld %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrld %xmm3, %xmm4 @@ -1059,7 +1059,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1078,7 +1078,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -1087,7 +1087,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; 
AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1096,7 +1096,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1210,11 +1210,10 @@ ; ; AVX2-LABEL: splatvar_shift_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -1222,32 +1221,21 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: splatvar_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1256,8 +1244,7 @@
;
; AVX512BW-LABEL: splatvar_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1266,11 +1253,10 @@
;
; AVX512DQVL-LABEL: splatvar_shift_v8i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
@@ -1278,8 +1264,7 @@
;
; AVX512BWVL-LABEL: splatvar_shift_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
@@ -1373,8 +1358,7 @@
;
; AVX1-LABEL: splatvar_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
@@ -1392,42 +1376,37 @@
;
; AVX2-LABEL: splatvar_shift_v4i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -1487,9 +1466,7 @@
;
; AVX1-LABEL: splatvar_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -1499,43 +1476,37 @@
;
; AVX2-LABEL: splatvar_shift_v2i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i8:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321]
-; XOPAVX1-NEXT: # xmm2 = mem[0,0]
-; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
-; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255]
-; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
Index: llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -564,8 +564,9 @@
define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -598,7 +599,7 @@
;
; AVX2-LABEL: splatvar_shift_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -614,7 +615,7 @@
;
; XOPAVX2-LABEL: splatvar_shift_v2i32:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -622,7 +623,7 @@
;
; AVX512-LABEL: splatvar_shift_v2i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -630,7 +631,7 @@
;
; AVX512VL-LABEL: splatvar_shift_v2i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -688,41 +689,37 @@
;
; AVX2-LABEL: splatvar_shift_v4i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i16:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
@@ -782,7 +779,7 @@
;
; AVX2-LABEL: splatvar_shift_v2i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -798,7 +795,7 @@
;
; XOPAVX2-LABEL: splatvar_shift_v2i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -806,7 +803,7 @@
;
; AVX512-LABEL: splatvar_shift_v2i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
@@ -814,7 +811,7 @@
;
; AVX512VL-LABEL: splatvar_shift_v2i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0