Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18827,6 +18827,56 @@ SDValue UndefV = LHS.getOperand(1); return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask()); } + + SDLoc DL(N); + SDValue UndefV = DAG.getUNDEF(VT); + auto isMaskSafeForReorderedShuffle = [&](bool IsShufOp0) { + // If the shuffle mask has no undefined elements, the mask is safe. + ShuffleVectorSDNode *Shuf = IsShufOp0 ? Shuf0 : Shuf1; + if (all_of(Shuf->getMask(), [](int M) { return M != -1; })) + return true; + + // If the binop would create undefined elements for the lanes where the + // shuffle does too, the mask is safe. If the mask has undefined elements, + // but the binop would not produce undefined elements, it is not safe (the + // new shuffle could produce undefined elements where the original code + // did not). + // TODO: We should have a query API that can answer whether some operation + // on undef/constant values produces undef/constant without actually + // creating constant nodes. + return IsShufOp0 ? DAG.getNode(Opcode, DL, VT, UndefV, RHS).isUndef() + : DAG.getNode(Opcode, DL, VT, LHS, UndefV).isUndef(); + }; + + // Move a splat after a binop to increase shuffle combining and demanded + // elements opportunities. + unsigned NumElts = VT.getVectorNumElements(); + if (Shuf0 && Shuf0->isSplat() && Shuf0->hasOneUse() && + (isConstOrConstSplat(RHS) || isConstOrConstSplatFP(RHS))) { + // bo (splat X), SplatC --> splat (bo X, SplatC) + SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0), RHS, + N->getFlags()); + // Re-use the existing shuffle mask if possible to preserve undef lanes. + if (isMaskSafeForReorderedShuffle(true)) + return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask()); + + // We can't preserve undef lanes, so create a "full" splat. + SmallVector SplatMask(NumElts, Shuf0->getSplatIndex()); + return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, SplatMask); + } + if (Shuf1 && Shuf1->isSplat() && Shuf1->hasOneUse() && + (isConstOrConstSplat(LHS) || isConstOrConstSplatFP(LHS))) { + // bo SplatC, (splat X) --> splat (bo SplatC, X) + SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, RHS.getOperand(0), + N->getFlags()); + // Re-use the existing shuffle mask if possible to preserve undef lanes. + if (isMaskSafeForReorderedShuffle(false)) + return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf1->getMask()); + + // We can't preserve undef lanes, so create a "full" splat. + SmallVector SplatMask(NumElts, Shuf1->getSplatIndex()); + return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, SplatMask); + } } // The following pattern is likely to emerge with vector reduction ops. Moving Index: llvm/test/CodeGen/ARM/reg_sequence.ll =================================================================== --- llvm/test/CodeGen/ARM/reg_sequence.ll +++ llvm/test/CodeGen/ARM/reg_sequence.ll @@ -276,11 +276,12 @@ ; CHECK: vmul.f32 [[Q8:q[0-9]+]], [[Q0]], [[Q0]] ; CHECK-NEXT: vadd.f32 [[Q8]], [[Q8]], [[Q8]] ; CHECK-NEXT: vadd.f32 [[Q1:q[0-9]+]], [[Q8]], [[Q8]] -; CHECK-NEXT: vmul.f32 [[Q8]], [[Q9]], d1[0] +; CHECK-NEXT: vext.32 [[Q8]], [[Q8]], [[Q8]], #2 +; CHECK-NEXT: vmul.f32 [[Q8]], [[Q8]], [[Q9]] ; CHECK-NEXT: vmul.f32 [[Q8]], [[Q8]], [[Q8]] ; CHECK-NEXT: vadd.f32 [[Q8]], [[Q8]], [[Q8]] ; CHECK-NEXT: vmul.f32 [[Q8]], [[Q8]], [[Q8]] -; CHECK-NEXT: vst1.32 {d17[1]}, [r0:32] +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movne r0, #0 Index: llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -9653,7 +9653,8 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x0f] +; X64-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 @@ -9676,8 +9677,9 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmbk_128: ; X64: # %bb.0: +; X64-NEXT: vpbroadcastd (%rdi), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x17] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f] +; X64-NEXT: vpmuldq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x28,0xca] ; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b @@ -9700,8 +9702,9 @@ ; ; X64-LABEL: test_mask_mul_epi32_rmbkz_128: ; X64: # %bb.0: +; X64-NEXT: vpbroadcastd (%rdi), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x0f] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07] +; X64-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load i64, i64* %ptr_b %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0 Index: llvm/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -249,9 +249,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -291,9 +289,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -855,9 +851,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -918,9 +912,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1699,9 +1691,7 @@ ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1772,9 +1762,7 @@ ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 Index: llvm/test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -251,9 +251,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -287,9 +285,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -795,9 +791,7 @@ ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -849,9 +843,7 @@ ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1603,9 +1595,7 @@ ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1667,9 +1657,7 @@ ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: xorl $32768, %eax ## imm = 0x8000 Index: llvm/test/CodeGen/X86/scalarize-fp.ll =================================================================== --- llvm/test/CodeGen/X86/scalarize-fp.ll +++ llvm/test/CodeGen/X86/scalarize-fp.ll @@ -507,14 +507,14 @@ define <2 x double> @fadd_splat_const_op1_v2f64(<2 x double> %vx) { ; SSE-LABEL: fadd_splat_const_op1_v2f64: ; SSE: # %bb.0: +; SSE-NEXT: addsd {{.*}}(%rip), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] -; SSE-NEXT: addpd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fadd_splat_const_op1_v2f64: ; AVX: # %bb.0: +; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> zeroinitializer %r = fadd <2 x double> %splatx, @@ -548,14 +548,14 @@ define <4 x float> @fmul_splat_const_op1_v4f32(<4 x float> %vx, <4 x float> %vy) { ; SSE-LABEL: fmul_splat_const_op1_v4f32: ; SSE: # %bb.0: +; SSE-NEXT: mulss {{.*}}(%rip), %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fmul_splat_const_op1_v4f32: ; AVX: # %bb.0: +; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %r = fmul fast <4 x float> %splatx, @@ -565,28 +565,18 @@ define <8 x float> @fdiv_splat_const_op0_v8f32(<8 x float> %vy) { ; SSE-LABEL: fdiv_splat_const_op0_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fdiv_splat_const_op0_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vrcpps %ymm0, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer %splaty = shufflevector <8 x float> %vy, <8 x float> undef, <8 x i32> zeroinitializer @@ -597,22 +587,18 @@ define <8 x float> @fdiv_const_op1_splat_v8f32(<8 x float> %vx) { ; SSE-LABEL: fdiv_const_op1_splat_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: rcpps %xmm1, %xmm1 -; SSE-NEXT: addps %xmm1, %xmm1 -; SSE-NEXT: mulps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: fdiv_const_op1_splat_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vrcpps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer %splaty = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1186,10 +1186,10 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psllq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE2-NEXT: psubq %xmm2, %xmm4 ; SSE2-NEXT: psrlq %xmm4, %xmm1 @@ -1206,25 +1206,25 @@ ; SSE41-LABEL: splatvar_funnnel_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllq %xmm4, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psllq %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64] -; SSE41-NEXT: psubq %xmm4, %xmm0 +; SSE41-NEXT: psubq %xmm2, %xmm0 ; SSE41-NEXT: psrlq %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1236,9 +1236,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1251,9 +1251,9 @@ ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1266,9 +1266,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1281,9 +1281,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1297,9 +1297,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1312,9 +1312,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1332,9 +1332,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1346,9 +1346,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1360,29 +1360,27 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psllq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllq %xmm4, %xmm5 -; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] -; X32-SSE-NEXT: psubq %xmm2, %xmm3 -; X32-SSE-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE-NEXT: psrlq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; X32-SSE-NEXT: orpd %xmm5, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: movl $63, %eax +; X32-SSE-NEXT: movd %eax, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; X32-SSE-NEXT: pxor %xmm4, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] +; X32-SSE-NEXT: pand %xmm4, %xmm5 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,0,64,0] +; X32-SSE-NEXT: psubq %xmm2, %xmm4 +; X32-SSE-NEXT: movdqa %xmm1, %xmm2 +; X32-SSE-NEXT: psrlq %xmm4, %xmm2 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; X32-SSE-NEXT: psrlq %xmm4, %xmm1 +; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psllq %xmm3, %xmm2 +; X32-SSE-NEXT: por %xmm1, %xmm2 +; X32-SSE-NEXT: pand %xmm5, %xmm0 +; X32-SSE-NEXT: pandn %xmm2, %xmm5 +; X32-SSE-NEXT: por %xmm5, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) @@ -1392,8 +1390,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: xorps %xmm4, %xmm4 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1405,6 +1402,7 @@ ; SSE2-NEXT: movd %ecx, %xmm4 ; SSE2-NEXT: psrld %xmm4, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm2 @@ -1414,28 +1412,28 @@ ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pslld %xmm0, %xmm2 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pslld %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] -; SSE41-NEXT: psubd %xmm4, %xmm0 +; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: psrld %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1448,11 +1446,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1466,11 +1464,11 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1484,10 +1482,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1501,11 +1499,11 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1520,11 +1518,11 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1538,10 +1536,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1560,10 +1558,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1576,11 +1574,11 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1593,8 +1591,7 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 ; X32-SSE-NEXT: xorps %xmm4, %xmm4 ; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1606,6 +1603,7 @@ ; X32-SSE-NEXT: movd %ecx, %xmm4 ; X32-SSE-NEXT: psrld %xmm4, %xmm1 ; X32-SSE-NEXT: por %xmm5, %xmm1 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X32-SSE-NEXT: pand %xmm2, %xmm0 ; X32-SSE-NEXT: pandn %xmm1, %xmm2 @@ -1619,53 +1617,53 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psllw %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrlw %xmm3, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpeqw %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psllw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16] -; SSE41-NEXT: psubw %xmm4, %xmm0 +; SSE41-NEXT: psubw %xmm2, %xmm0 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE41-NEXT: psrlw %xmm0, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: por %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqw %xmm4, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1678,10 +1676,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1694,10 +1692,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1710,10 +1708,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1727,10 +1725,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1745,10 +1743,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1762,10 +1760,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1784,11 +1782,11 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1801,10 +1799,10 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1817,24 +1815,24 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm4 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllw %xmm2, %xmm5 +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psllw %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psrlw %xmm3, %xmm1 -; X32-SSE-NEXT: por %xmm5, %xmm1 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm4 -; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm1 +; X32-SSE-NEXT: pxor %xmm3, %xmm3 +; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3 +; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pandn %xmm1, %xmm3 +; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat) @@ -1844,62 +1842,63 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psllw %xmm2, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: psllw %xmm2, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm3, %xmm1 -; SSE2-NEXT: psrlw %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psllw %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: psllw %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm4 +; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm4, %xmm1 +; SSE2-NEXT: psrlw %xmm4, %xmm5 +; SSE2-NEXT: psrlw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psllw %xmm5, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE41-NEXT: psllw %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE41-NEXT: psubb %xmm2, %xmm5 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm5, %xmm1 -; SSE41-NEXT: psrlw %xmm5, %xmm6 -; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm1, %xmm6 -; SSE41-NEXT: por %xmm6, %xmm4 +; SSE41-NEXT: psllw %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm6 +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufb %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: psubb %xmm2, %xmm6 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlw %xmm6, %xmm1 +; SSE41-NEXT: psrlw %xmm6, %xmm5 +; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm1, %xmm5 +; SSE41-NEXT: por %xmm5, %xmm4 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -1907,30 +1906,29 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsllw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %xmm0, %xmm4 @@ -1938,6 +1936,7 @@ ; AVX2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3 ; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1954,9 +1953,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1974,9 +1973,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1995,9 +1994,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero @@ -2016,9 +2015,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero @@ -2036,9 +2035,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2056,9 +2055,9 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2076,9 +2075,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4 ; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 @@ -2089,8 +2088,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 @@ -2102,38 +2101,39 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X32-SSE-NEXT: psubb %xmm2, %xmm3 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm4 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: psllw %xmm2, %xmm5 -; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 -; X32-SSE-NEXT: psllw %xmm2, %xmm6 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; X32-SSE-NEXT: pand %xmm5, %xmm6 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm3, %xmm1 -; X32-SSE-NEXT: psrlw %xmm3, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 +; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: psllw %xmm3, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 +; X32-SSE-NEXT: psllw %xmm3, %xmm6 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: pand %xmm4, %xmm3 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: pandn %xmm2, %xmm4 -; X32-SSE-NEXT: por %xmm4, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X32-SSE-NEXT: psubb %xmm2, %xmm4 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm4, %xmm1 +; X32-SSE-NEXT: psrlw %xmm4, %xmm5 +; X32-SSE-NEXT: psrlw $8, %xmm5 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-SSE-NEXT: pand %xmm1, %xmm4 +; X32-SSE-NEXT: por %xmm3, %xmm4 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pandn %xmm4, %xmm1 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshl-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-256.ll +++ llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -906,35 +906,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -947,10 +943,10 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -962,9 +958,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -977,10 +974,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -993,10 +990,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1008,9 +1005,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1028,35 +1026,31 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrlq %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpsrlq %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1073,39 +1067,34 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1119,11 +1108,11 @@ ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1136,10 +1125,11 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1153,11 +1143,11 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1171,11 +1161,11 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1188,10 +1178,11 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1210,39 +1201,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpslld %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; XOPAVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; XOPAVX1-NEXT: vpslld %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1260,30 +1246,25 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1291,10 +1272,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1307,10 +1288,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1323,10 +1304,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1340,10 +1321,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1357,10 +1338,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1373,10 +1354,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1395,39 +1376,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; XOPAVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; XOPAVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; XOPAVX1-NEXT: vpsrlw %xmm5, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1445,41 +1421,33 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpsllw %xmm4, %xmm9, %xmm7 -; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm5, %xmm10, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpsrlw %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsubb %xmm2, %xmm10, %xmm6 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-NEXT: vpcmpeqb %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm8, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1487,14 +1455,14 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1511,14 +1479,14 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1535,14 +1503,14 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1560,9 +1528,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1580,9 +1548,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1599,9 +1567,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1618,9 +1586,9 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1637,34 +1605,29 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; XOPAVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm5 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; XOPAVX1-NEXT: vorps %ymm1, %ymm4, %ymm1 +; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm6 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm4, %xmm7 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vorps %ymm1, %ymm5, %ymm1 -; XOPAVX1-NEXT: vpcomeqb %xmm8, %xmm4, %xmm3 -; XOPAVX1-NEXT: vpcomeqb %xmm8, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm4 Index: llvm/test/CodeGen/X86/vector-fshl-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -552,9 +552,10 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -566,9 +567,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -580,9 +582,10 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -600,9 +603,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -625,10 +629,11 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -641,10 +646,11 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -657,10 +663,11 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -679,10 +686,11 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -706,10 +714,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -726,10 +734,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -746,10 +754,10 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -768,10 +776,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -795,14 +803,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 ; AVX512F-NEXT: vpsllw %xmm5, %ymm9, %ymm8 ; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -825,14 +833,14 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm9, %ymm8 ; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -855,14 +863,14 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 ; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 ; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -879,14 +887,14 @@ ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 ; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 ; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -903,14 +911,14 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -927,14 +935,14 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 ; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero Index: llvm/test/CodeGen/X86/vector-fshl-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -797,8 +797,8 @@ ; ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pslld %xmm2, %xmm3 @@ -811,8 +811,8 @@ ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -824,9 +824,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -899,9 +899,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -916,9 +916,9 @@ ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw %xmm2, %xmm3 @@ -931,11 +931,11 @@ ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -945,8 +945,8 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -958,8 +958,8 @@ ; ; AVX512-LABEL: splatvar_funnnel_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -984,9 +984,9 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -1006,9 +1006,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 @@ -1039,9 +1036,9 @@ ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pshufb %xmm3, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psllw %xmm4, %xmm2 @@ -1063,9 +1060,9 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1084,8 +1081,8 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1191,9 +1188,6 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 Index: llvm/test/CodeGen/X86/vector-fshl-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -316,8 +316,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -332,8 +332,8 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -383,8 +383,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -409,8 +409,8 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 Index: llvm/test/CodeGen/X86/vector-fshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1203,10 +1203,10 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrlq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE2-NEXT: psubq %xmm2, %xmm4 ; SSE2-NEXT: psllq %xmm4, %xmm0 @@ -1224,10 +1224,10 @@ ; SSE41-LABEL: splatvar_funnnel_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrlq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64] ; SSE41-NEXT: psubq %xmm2, %xmm4 ; SSE41-NEXT: psllq %xmm4, %xmm3 @@ -1240,9 +1240,9 @@ ; ; AVX1-LABEL: splatvar_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1254,9 +1254,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1269,9 +1269,9 @@ ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1284,9 +1284,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1298,9 +1298,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1314,9 +1314,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1329,9 +1329,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1349,9 +1349,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1363,9 +1363,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 @@ -1377,28 +1377,26 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa %xmm1, %xmm3 -; X32-SSE-NEXT: psrlq %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlq %xmm4, %xmm5 -; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] -; X32-SSE-NEXT: psubq %xmm2, %xmm3 +; X32-SSE-NEXT: movl $63, %eax +; X32-SSE-NEXT: movd %eax, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; X32-SSE-NEXT: pxor %xmm5, %xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,0,3,2] +; X32-SSE-NEXT: pand %xmm5, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [64,0,64,0] +; X32-SSE-NEXT: psubq %xmm4, %xmm5 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; X32-SSE-NEXT: psllq %xmm3, %xmm0 +; X32-SSE-NEXT: psllq %xmm5, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; X32-SSE-NEXT: psllq %xmm5, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; X32-SSE-NEXT: orpd %xmm5, %xmm0 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] -; X32-SSE-NEXT: pand %xmm3, %xmm2 +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlq %xmm3, %xmm4 +; X32-SSE-NEXT: por %xmm0, %xmm4 ; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: pandn %xmm4, %xmm2 ; X32-SSE-NEXT: por %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl @@ -1410,8 +1408,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: xorps %xmm4, %xmm4 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1423,6 +1420,7 @@ ; SSE2-NEXT: movd %ecx, %xmm4 ; SSE2-NEXT: pslld %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 @@ -1433,11 +1431,11 @@ ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrld %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] ; SSE41-NEXT: psubd %xmm2, %xmm0 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -1451,10 +1449,10 @@ ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1467,11 +1465,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1485,11 +1483,11 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1503,10 +1501,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1519,11 +1517,11 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1538,11 +1536,11 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1556,10 +1554,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1578,10 +1576,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1594,11 +1592,11 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1611,8 +1609,7 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm2 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 ; X32-SSE-NEXT: xorps %xmm4, %xmm4 ; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] @@ -1624,6 +1621,7 @@ ; X32-SSE-NEXT: movd %ecx, %xmm4 ; X32-SSE-NEXT: pslld %xmm4, %xmm0 ; X32-SSE-NEXT: por %xmm5, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: pandn %xmm0, %xmm2 @@ -1638,21 +1636,21 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm3, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlw %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm3, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psllw %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm3, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw %xmm3, %xmm5 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 @@ -1662,12 +1660,12 @@ ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psrlw %xmm0, %xmm4 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16] ; SSE41-NEXT: psubw %xmm2, %xmm0 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero @@ -1681,11 +1679,11 @@ ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1698,10 +1696,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1714,10 +1712,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1730,10 +1728,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1747,10 +1745,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1765,10 +1763,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1782,10 +1780,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1804,11 +1802,11 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1821,10 +1819,10 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1837,21 +1835,21 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm3, %xmm4 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlw %xmm2, %xmm4 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm3, %xmm2 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psllw %xmm2, %xmm0 +; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlw %xmm3, %xmm5 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 ; X32-SSE-NEXT: pand %xmm2, %xmm1 ; X32-SSE-NEXT: pandn %xmm0, %xmm2 ; X32-SSE-NEXT: por %xmm1, %xmm2 @@ -1865,63 +1863,63 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE2-NEXT: psubb %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlw %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlw %xmm3, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE2-NEXT: psrlw %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: psubb %xmm2, %xmm4 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psllw %xmm4, %xmm0 -; SSE2-NEXT: psllw %xmm4, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psllw %xmm4, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: psrlw %xmm4, %xmm5 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrlw %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE41-NEXT: psrlw %xmm4, %xmm7 -; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm5, %xmm7 +; SSE41-NEXT: psrlw %xmm0, %xmm6 +; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE41-NEXT: psubb %xmm2, %xmm4 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: psllw %xmm4, %xmm3 -; SSE41-NEXT: psllw %xmm4, %xmm6 -; SSE41-NEXT: pshufb %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: psllw %xmm4, %xmm5 +; SSE41-NEXT: pshufb %xmm0, %xmm5 +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: por %xmm6, %xmm3 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -1929,30 +1927,29 @@ ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsllw %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4 @@ -1961,6 +1958,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3 ; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1976,9 +1974,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1996,9 +1994,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2017,9 +2015,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero @@ -2038,9 +2036,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero @@ -2058,9 +2056,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2077,9 +2075,9 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2096,9 +2094,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -2111,8 +2109,8 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4 @@ -2126,39 +2124,39 @@ ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X32-SSE-NEXT: psubb %xmm3, %xmm4 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm1, %xmm5 -; X32-SSE-NEXT: psrlw %xmm3, %xmm5 +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 +; X32-SSE-NEXT: psrlw %xmm3, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm5, %xmm5 ; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6 ; X32-SSE-NEXT: psrlw %xmm3, %xmm6 -; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3 ; X32-SSE-NEXT: psrlw $8, %xmm6 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; X32-SSE-NEXT: pand %xmm5, %xmm6 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; X32-SSE-NEXT: pand %xmm4, %xmm3 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X32-SSE-NEXT: psubb %xmm2, %xmm4 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: psllw %xmm4, %xmm0 -; X32-SSE-NEXT: psllw %xmm4, %xmm3 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; X32-SSE-NEXT: pand %xmm0, %xmm3 -; X32-SSE-NEXT: por %xmm6, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm3, %xmm2 -; X32-SSE-NEXT: por %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE-NEXT: psllw %xmm4, %xmm5 +; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-SSE-NEXT: pand %xmm0, %xmm4 +; X32-SSE-NEXT: por %xmm3, %xmm4 +; X32-SSE-NEXT: pxor %xmm0, %xmm0 +; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-SSE-NEXT: pand %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm4, %xmm0 +; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) Index: llvm/test/CodeGen/X86/vector-fshr-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -910,35 +910,31 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsllq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -951,10 +947,10 @@ ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -966,9 +962,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -980,10 +977,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -996,10 +993,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -1011,9 +1008,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -1031,35 +1029,31 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm4 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllq %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpsllq %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 @@ -1076,39 +1070,34 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsrld %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpslld %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-NEXT: vpslld %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1122,11 +1111,11 @@ ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1139,10 +1128,11 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1155,11 +1145,11 @@ ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1173,11 +1163,11 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1190,10 +1180,11 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1212,39 +1203,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsrld %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpslld %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; XOPAVX1-NEXT: vpslld %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32] +; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -1262,30 +1248,25 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1293,10 +1274,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1309,10 +1290,10 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1325,10 +1306,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1342,10 +1323,10 @@ ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1359,10 +1340,10 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1375,10 +1356,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1397,39 +1378,34 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm6 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5 -; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; XOPAVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1447,40 +1423,33 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw %xmm4, %xmm8, %xmm7 -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX1-NEXT: vpsubb %xmm5, %xmm7, %xmm6 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsllw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsubb %xmm2, %xmm7, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpsubb %xmm2, %xmm6, %xmm6 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpsllw %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpsllw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7 ; AVX1-NEXT: vpsllw %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm9, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1488,8 +1457,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1497,6 +1465,7 @@ ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1512,8 +1481,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1521,6 +1489,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1536,8 +1505,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -1545,6 +1513,7 @@ ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1561,9 +1530,9 @@ ; AVX512BW-LABEL: splatvar_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1581,9 +1550,9 @@ ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1600,9 +1569,9 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1618,9 +1587,9 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] @@ -1636,35 +1605,30 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOPAVX1-NEXT: vpsubb %xmm4, %xmm3, %xmm5 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm6 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm4, %xmm8, %xmm7 +; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm6, %xmm6 -; XOPAVX1-NEXT: vpsubb %xmm2, %xmm8, %xmm7 -; XOPAVX1-NEXT: vpshlb %xmm7, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm5, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm4, %xmm4 +; XOPAVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 ; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3 Index: llvm/test/CodeGen/X86/vector-fshr-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -544,9 +544,10 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0 @@ -557,9 +558,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0 @@ -570,9 +572,10 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0 @@ -590,9 +593,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0 @@ -615,10 +619,11 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -630,10 +635,11 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -645,10 +651,11 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -667,10 +674,11 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero @@ -694,10 +702,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -714,10 +722,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero @@ -734,10 +742,10 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -756,10 +764,10 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -783,8 +791,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 ; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 @@ -792,6 +799,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm8, %ymm8 ; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -813,8 +821,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 @@ -822,6 +829,7 @@ ; AVX512VL-NEXT: vpsrlw $8, %ymm8, %ymm8 ; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8 ; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero @@ -843,8 +851,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 @@ -852,6 +859,7 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -866,8 +874,7 @@ ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 ; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 @@ -875,6 +882,7 @@ ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -889,8 +897,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 @@ -898,6 +905,7 @@ ; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -912,8 +920,7 @@ ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 ; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 @@ -921,6 +928,7 @@ ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero Index: llvm/test/CodeGen/X86/vector-fshr-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -785,17 +785,17 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -843,26 +843,26 @@ ; ; SSE41-LABEL: splatvar_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: psubd %xmm1, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pslld %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32] -; SSE41-NEXT: psubd %xmm2, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero ; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -874,11 +874,11 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -920,17 +920,17 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -956,49 +956,49 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; SSE2-NEXT: psubw %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllw %xmm2, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllw %xmm1, %xmm3 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrlw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: psubw %xmm1, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; SSE41-NEXT: psubw %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; SSE41-NEXT: psubw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -1008,10 +1008,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -1023,10 +1023,10 @@ ; ; AVX512-LABEL: splatvar_funnnel_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -1038,37 +1038,37 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; X32-SSE-LABEL: splatvar_funnnel_v8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: psubw %xmm2, %xmm1 -; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psllw %xmm2, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psllw %xmm1, %xmm3 +; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] +; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-SSE-NEXT: psrlw %xmm2, %xmm0 ; X32-SSE-NEXT: por %xmm3, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer @@ -1079,9 +1079,6 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; SSE2-LABEL: splatvar_funnnel_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: psubb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 @@ -1114,10 +1111,10 @@ ; SSE41-LABEL: splatvar_funnnel_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubb %xmm1, %xmm3 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE41-NEXT: pshufb %xmm2, %xmm3 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllw %xmm4, %xmm1 @@ -1140,9 +1137,9 @@ ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1161,10 +1158,10 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1258,24 +1255,21 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; X32-SSE-LABEL: splatvar_funnnel_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pxor %xmm2, %xmm2 ; X32-SSE-NEXT: psubb %xmm1, %xmm2 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 Index: llvm/test/CodeGen/X86/vector-fshr-rot-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -628,9 +628,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 @@ -639,9 +639,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 @@ -656,9 +656,9 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -676,9 +676,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero @@ -720,9 +720,9 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 @@ -731,9 +731,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 @@ -748,10 +748,10 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -769,9 +769,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -784,9 +784,9 @@ ; ; AVX512-LABEL: splatvar_funnnel_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2 @@ -799,10 +799,10 @@ ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -811,9 +811,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 @@ -829,8 +829,8 @@ ; AVX1-LABEL: splatvar_funnnel_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 @@ -857,9 +857,9 @@ ; ; AVX2-LABEL: splatvar_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -880,9 +880,9 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -903,9 +903,9 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3 @@ -961,8 +961,8 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 @@ -971,9 +971,9 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 Index: llvm/test/CodeGen/X86/vector-fshr-rot-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -328,10 +328,10 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -346,10 +346,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm3, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] @@ -399,10 +399,10 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 @@ -427,10 +427,10 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 Index: llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax-widen.ll @@ -1160,11 +1160,9 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1208,9 +1206,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1265,9 +1261,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1335,9 +1329,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1430,9 +1422,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1543,9 +1533,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 Index: llvm/test/CodeGen/X86/vector-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -727,25 +727,23 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: retq ; @@ -753,18 +751,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: retq @@ -772,29 +770,29 @@ ; AVX1-LABEL: test_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -802,10 +800,9 @@ ; AVX512VL-LABEL: test_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> %a0) @@ -1209,15 +1206,12 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1226,9 +1220,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq @@ -1237,18 +1231,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: por %xmm0, %xmm3 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax @@ -1257,11 +1251,10 @@ ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1269,10 +1262,9 @@ ; AVX512BW-LABEL: test_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1281,10 +1273,9 @@ ; AVX512VL-LABEL: test_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq @@ -1304,17 +1295,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1325,10 +1313,9 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] ; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE41-NEXT: pmaxud %xmm2, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq @@ -1340,10 +1327,9 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1355,10 +1341,9 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1380,9 +1365,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1450,9 +1433,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1545,9 +1526,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1658,9 +1637,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1762,16 +1739,12 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1780,9 +1753,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -1790,19 +1763,19 @@ ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: por %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pextrb $0, %xmm2, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1810,21 +1783,19 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1832,10 +1803,9 @@ ; ; AVX512VL-LABEL: test_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq @@ -1855,18 +1825,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1877,11 +1843,10 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pmaxud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: pmaxud %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1891,10 +1856,9 @@ ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1905,10 +1869,9 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1919,10 +1882,9 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1942,14 +1904,11 @@ ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1964,11 +1923,11 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pmaxuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmaxuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pmaxuw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1981,10 +1940,10 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1998,10 +1957,10 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -2015,10 +1974,10 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin-widen.ll @@ -1159,11 +1159,9 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1207,9 +1205,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1264,9 +1260,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1315,9 +1309,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1387,9 +1379,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1475,9 +1465,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 Index: llvm/test/CodeGen/X86/vector-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -726,14 +726,12 @@ define i32 @test_v2i32(<2 x i32> %a0) { ; SSE2-LABEL: test_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -742,9 +740,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: retq ; @@ -752,18 +750,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: retq @@ -771,29 +769,29 @@ ; AVX1-LABEL: test_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpminuq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -801,10 +799,9 @@ ; AVX512VL-LABEL: test_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> %a0) @@ -1208,15 +1205,12 @@ define i16 @test_v2i16(<2 x i16> %a0) { ; SSE2-LABEL: test_v2i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1225,9 +1219,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq @@ -1236,18 +1230,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax @@ -1256,11 +1250,10 @@ ; AVX-LABEL: test_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1268,10 +1261,9 @@ ; AVX512BW-LABEL: test_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1280,10 +1272,9 @@ ; AVX512VL-LABEL: test_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq @@ -1303,17 +1294,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1324,10 +1312,9 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3,4,5,6,7] ; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq @@ -1339,10 +1326,9 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -1354,10 +1340,9 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -1379,9 +1364,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1430,9 +1413,7 @@ ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1502,9 +1483,7 @@ ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1590,9 +1569,7 @@ ; SSE2-NEXT: pxor %xmm8, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 @@ -1668,16 +1645,12 @@ define i8 @test_v2i8(<2 x i8> %a0) { ; SSE2-LABEL: test_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] @@ -1686,9 +1659,9 @@ ; SSE2-NEXT: pand %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq @@ -1696,19 +1669,19 @@ ; SSE41-LABEL: test_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: por %xmm0, %xmm3 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pextrb $0, %xmm2, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1716,21 +1689,19 @@ ; ; AVX-LABEL: test_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpextrb $0, %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpminuq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax ; AVX512BW-NEXT: vzeroupper @@ -1738,10 +1709,9 @@ ; ; AVX512VL-LABEL: test_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpminuq %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq @@ -1761,18 +1731,14 @@ ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1783,11 +1749,10 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: pminud %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1797,10 +1762,9 @@ ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1811,10 +1775,9 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1825,10 +1788,9 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpminud %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq @@ -1848,14 +1810,11 @@ ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; @@ -1870,11 +1829,11 @@ ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pminuw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pminuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE41-NEXT: pminuw %xmm0, %xmm1 +; SSE41-NEXT: pextrb $0, %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1887,10 +1846,10 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: retq @@ -1904,10 +1863,10 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: retq @@ -1921,10 +1880,10 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpminuw %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq Index: llvm/test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rotate-128.ll +++ llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -752,8 +752,8 @@ ; ; SSE41-LABEL: splatvar_rotate_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pslld %xmm2, %xmm3 @@ -766,8 +766,8 @@ ; ; AVX1-LABEL: splatvar_rotate_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] @@ -779,9 +779,9 @@ ; ; AVX2-LABEL: splatvar_rotate_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -857,9 +857,9 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_rotate_v8i16: ; SSE2: # %bb.0: +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; SSE2-NEXT: psubw %xmm1, %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -874,9 +874,9 @@ ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw %xmm2, %xmm3 @@ -889,11 +889,11 @@ ; ; AVX1-LABEL: splatvar_rotate_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -903,8 +903,8 @@ ; ; AVX2-LABEL: splatvar_rotate_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -916,8 +916,8 @@ ; ; AVX512-LABEL: splatvar_rotate_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] @@ -942,9 +942,9 @@ ; ; X32-SSE-LABEL: splatvar_rotate_v8i16: ; X32-SSE: # %bb.0: +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; X32-SSE-NEXT: psubw %xmm1, %xmm2 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] @@ -967,9 +967,6 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-LABEL: splatvar_rotate_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SSE2-NEXT: psubb %xmm1, %xmm2 @@ -1000,9 +997,9 @@ ; ; SSE41-LABEL: splatvar_rotate_v16i8: ; SSE41: # %bb.0: +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pshufb %xmm3, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psllw %xmm4, %xmm2 @@ -1024,9 +1021,9 @@ ; ; AVX1-LABEL: splatvar_rotate_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 @@ -1045,8 +1042,8 @@ ; ; AVX2-LABEL: splatvar_rotate_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 @@ -1140,9 +1137,6 @@ ; ; X32-SSE-LABEL: splatvar_rotate_v16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X32-SSE-NEXT: psubb %xmm1, %xmm2 Index: llvm/test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -645,17 +645,17 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshaq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -848,16 +848,16 @@ ; XOPAVX1-LABEL: splatvar_shift_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ -1368,18 +1368,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v8i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v8i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1510,18 +1510,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1652,9 +1652,9 @@ ; ; XOP-LABEL: splatvar_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -930,18 +930,19 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrlq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: psrlq %xmm3, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm0, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrlq %xmm0, %xmm4 +; SSE2-NEXT: psrlq %xmm3, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE2-NEXT: xorpd %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 @@ -996,7 +997,7 @@ ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 @@ -1021,7 +1022,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 @@ -1032,7 +1033,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 @@ -1044,7 +1045,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsraq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 @@ -1083,8 +1084,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad %xmm1, %xmm3 @@ -1107,44 +1109,42 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pslld $16, %xmm0 ; SSE41-NEXT: psrad $16, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrad %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: retq ; @@ -1152,9 +1152,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1162,9 +1161,9 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpslld $16, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -1173,9 +1172,8 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1183,9 +1181,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1193,9 +1190,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1203,8 +1200,9 @@ ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pslld $16, %xmm0 ; X32-SSE-NEXT: psrad $16, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psrad %xmm1, %xmm3 @@ -1307,7 +1305,7 @@ ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 @@ -1333,7 +1331,7 @@ ; XOPAVX2-NEXT: vpsllq $48, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshaq {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 @@ -1343,7 +1341,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $48, %zmm0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 @@ -1355,7 +1353,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllq $48, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsraq $48, %xmm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-shift-lshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -699,16 +699,16 @@ ; XOPAVX1-LABEL: splatvar_shift_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll @@ -1120,18 +1120,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v8i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v8i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1242,18 +1242,18 @@ ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -1364,9 +1364,9 @@ ; ; XOP-LABEL: splatvar_shift_v2i8: ; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -775,10 +775,10 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrlq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -815,7 +815,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -834,7 +834,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -843,7 +843,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -852,7 +852,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -877,18 +877,18 @@ define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm1, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld %xmm1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 @@ -902,52 +902,52 @@ ; SSE41-LABEL: splatvar_shift_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrld %xmm1, %xmm2 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld %xmm0, %xmm3 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: psrld %xmm0, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -955,8 +955,8 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -965,8 +965,8 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; @@ -974,8 +974,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -983,25 +983,25 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm1, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: psrld %xmm1, %xmm3 +; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrld %xmm4, %xmm1 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psrld %xmm3, %xmm4 @@ -1059,7 +1059,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1078,7 +1078,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -1087,7 +1087,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1096,7 +1096,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1210,11 +1210,10 @@ ; ; AVX2-LABEL: splatvar_shift_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -1222,32 +1221,21 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; XOPAVX1-LABEL: splatvar_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: splatvar_shift_v8i8: +; XOP: # %bb.0: +; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512DQ-LABEL: splatvar_shift_v8i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1256,8 +1244,7 @@ ; ; AVX512BW-LABEL: splatvar_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1266,11 +1253,10 @@ ; ; AVX512DQVL-LABEL: splatvar_shift_v8i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1278,8 +1264,7 @@ ; ; AVX512BWVL-LABEL: splatvar_shift_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero ; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: retq @@ -1373,8 +1358,7 @@ ; ; AVX1-LABEL: splatvar_shift_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 @@ -1392,42 +1376,37 @@ ; ; AVX2-LABEL: splatvar_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v4i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero ; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1487,9 +1466,7 @@ ; ; AVX1-LABEL: splatvar_shift_v2i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -1499,43 +1476,37 @@ ; ; AVX2-LABEL: splatvar_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v2i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1.2598673968951787E-321,1.2598673968951787E-321] -; XOPAVX1-NEXT: # xmm2 = mem[0,0] -; XOPAVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; XOPAVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v2i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq Index: llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -564,8 +564,9 @@ define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-LABEL: splatvar_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psllq %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -598,7 +599,7 @@ ; ; AVX2-LABEL: splatvar_shift_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -614,7 +615,7 @@ ; ; XOPAVX2-LABEL: splatvar_shift_v2i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -622,7 +623,7 @@ ; ; AVX512-LABEL: splatvar_shift_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -630,7 +631,7 @@ ; ; AVX512VL-LABEL: splatvar_shift_v2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -688,41 +689,37 @@ ; ; AVX2-LABEL: splatvar_shift_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v4i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_shift_v4i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -782,7 +779,7 @@ ; ; AVX2-LABEL: splatvar_shift_v2i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -798,7 +795,7 @@ ; ; XOPAVX2-LABEL: splatvar_shift_v2i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -806,7 +803,7 @@ ; ; AVX512-LABEL: splatvar_shift_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 @@ -814,7 +811,7 @@ ; ; AVX512VL-LABEL: splatvar_shift_v2i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0