Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17687,6 +17687,15 @@
     SDValue In = N->getOperand(0);
     assert(In.getValueType().isVector() && "Must concat vectors");
 
+    // If the input is a concat_vectors, just make a larger concat by padding
+    // with smaller undefs.
+    if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
+      unsigned NumOps = N->getNumOperands() * In.getNumOperands();
+      SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
+      Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
+      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+    }
+
     SDValue Scalar = peekThroughOneUseBitcasts(In);
 
     // concat_vectors(scalar_to_vector(scalar), undef) ->
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44492,6 +44492,20 @@
     }
   }
 
+  // If we are extracting from an insert into a zero vector, replace with a
+  // smaller insert into zero if we don't access less than the original
+  // subvector. Don't do this for i1 vectors.
+  if (VT.getVectorElementType() != MVT::i1 &&
+      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
+      InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
+      ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+      InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
+    SDLoc DL(N);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+                       getZeroVector(VT, Subtarget, DAG, DL),
+                       InVec.getOperand(1), InVec.getOperand(2));
+  }
+
   // If we're extracting from a broadcast then we're better off just
   // broadcasting to the smaller type directly, assuming this is the only use.
   // As its a broadcast we don't care about the extraction index.
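Not part of the patch: a minimal standalone sketch of why the two rewrites above are semantics-preserving. None of the helpers, names, or vector widths below are LLVM API; they are invented for illustration (a 2-element subvector inserted into 8-element zeros, extracted as 4 elements), and the concat case is modelled on plain operand lists the same way the DAGCombiner change pads with undef operands.

// Toy model only, not LLVM code. Build with: c++ -std=c++17 sketch.cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

using Vec = std::vector<float>;

// insert_subvector(Base, Sub, Idx): overwrite Base starting at element Idx.
static Vec insertSubvector(Vec Base, const Vec &Sub, std::size_t Idx) {
  assert(Idx + Sub.size() <= Base.size());
  for (std::size_t I = 0; I != Sub.size(); ++I)
    Base[Idx + I] = Sub[I];
  return Base;
}

// extract_subvector(V, Idx, Num): take Num elements starting at Idx.
static Vec extractSubvector(const Vec &V, std::size_t Idx, std::size_t Num) {
  assert(Idx + Num <= V.size());
  return Vec(V.begin() + Idx, V.begin() + Idx + Num);
}

int main() {
  // X86 rewrite: extract_subvector(insert_subvector(zeros, X, 0), 0)
  //              -> insert_subvector(narrower zeros, X, 0)
  // Legal here because X (2 elements) is no wider than the extracted type (4).
  Vec X = {1.0f, 2.0f};
  Vec Zero8(8, 0.0f), Zero4(4, 0.0f);
  Vec Before = extractSubvector(insertSubvector(Zero8, X, 0), 0, 4);
  Vec After = insertSubvector(Zero4, X, 0);
  assert(Before == After);

  // DAGCombiner rewrite: concat_vectors(concat_vectors(A, B), undef)
  //                      -> concat_vectors(A, B, undef, undef)
  // Modelled on operand lists; NumOps = outer operand count * inner count.
  std::vector<std::string> Inner = {"A", "B"};
  std::size_t OuterNumOps = 2; // the outer concat has operands {inner, undef}
  std::vector<std::string> Ops(Inner.begin(), Inner.end());
  Ops.resize(OuterNumOps * Inner.size(), "undef");
  assert((Ops == std::vector<std::string>{"A", "B", "undef", "undef"}));
  return 0;
}

The program exits cleanly, which is the whole point: the narrowed insert produces exactly the low lanes the extract would have read, and the flattened concat has the same operands in the same positions as the nested one.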
Index: llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
===================================================================
--- llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -2707,7 +2707,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -2734,7 +2734,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -2764,7 +2764,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -2796,7 +2796,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -2828,7 +2828,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -2859,7 +2859,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7590,7 +7590,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7617,7 +7617,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7647,7 +7647,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7679,7 +7679,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7711,7 +7711,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -7742,7 +7742,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12533,7 +12533,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12560,7 +12560,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12590,7 +12590,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12622,7 +12622,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12654,7 +12654,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -12685,7 +12685,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17496,7 +17496,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17523,7 +17523,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17553,7 +17553,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17585,7 +17585,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17617,7 +17617,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -17648,7 +17648,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21407,7 +21407,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21434,7 +21434,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21462,7 +21462,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21493,7 +21493,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21524,7 +21524,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
@@ -21556,7 +21556,7 @@
 ; NoVLX-NEXT: kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: andl $3, %eax
+; NoVLX-NEXT: movzbl %al, %eax
 ; NoVLX-NEXT: vzeroupper
 ; NoVLX-NEXT: retq
 entry:
Index: llvm/test/CodeGen/X86/oddshuffles.ll
===================================================================
--- llvm/test/CodeGen/X86/oddshuffles.ll
+++ llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1513,34 +1513,34 @@
 ; AVX1-LABEL: interleave_24i32_in:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vmovupd (%rsi), %ymm0
-; AVX1-NEXT: vmovups 16(%rcx), %xmm1
-; AVX1-NEXT: vmovups (%rdx), %xmm2
-; AVX1-NEXT: vmovups 16(%rdx), %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm1[3,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,1],xmm4[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT: vmovups (%rdx), %xmm1
+; AVX1-NEXT: vmovups 16(%rdx), %xmm2
 ; AVX1-NEXT: vmovups (%rsi), %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm2[2,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT: vmovups 16(%rcx), %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
 ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm2, (%rdi)
-; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm1, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -1549,17 +1549,17 @@
 ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,1,3,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
+; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm4
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1567,8 +1567,8 @@
 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdi)
-; AVX2-SLOW-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi)
+; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi)
 ; AVX2-SLOW-NEXT: vzeroupper
 ; AVX2-SLOW-NEXT: retq
 ;
@@ -1577,27 +1577,27 @@
 ; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0
 ; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1
 ; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm4
 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-NEXT: vmovups %ymm0, (%rdi)
-; AVX2-FAST-NEXT: vmovups %ymm2, 32(%rdi)
-; AVX2-FAST-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT: vmovups %ymm4, 64(%rdi)
+; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq
 ;
@@ -1605,32 +1605,32 @@
 ; XOP: # %bb.0:
 ; XOP-NEXT: vmovupd (%rsi), %ymm0
 ; XOP-NEXT: vmovups (%rcx), %ymm1
-; XOP-NEXT: vmovups 16(%rcx), %xmm2
-; XOP-NEXT: vmovups (%rdx), %xmm3
-; XOP-NEXT: vmovups 16(%rdx), %xmm4
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
-; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
-; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT: vmovups (%rdx), %xmm2
+; XOP-NEXT: vmovups 16(%rdx), %xmm3
 ; XOP-NEXT: vmovups (%rsi), %xmm4
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
-; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
 ; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
 ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT: vmovups 16(%rcx), %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
+; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
 ; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
-; XOP-NEXT: vmovups %ymm3, (%rdi)
-; XOP-NEXT: vmovups %ymm2, 64(%rdi)
+; XOP-NEXT: vmovups %ymm3, 64(%rdi)
+; XOP-NEXT: vmovups %ymm2, (%rdi)
 ; XOP-NEXT: vzeroupper
 ; XOP-NEXT: retq
 %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
Index: llvm/test/CodeGen/X86/vec_saddo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_saddo.ll
+++ llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1791,48 +1791,48 @@
 ;
 ; AVX512-LABEL: saddo_v2i128:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbp
 ; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq %rcx, %r14
-; AVX512-NEXT: adcq %r11, %r14
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: cmpb %bl, %cl
-; AVX512-NEXT: setne %bl
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: setns %al
-; AVX512-NEXT: cmpb %al, %cl
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: andb %bl, %al
-; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: testq %r9, %r9
 ; AVX512-NEXT: setns %al
 ; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: cmpb %al, %cl
-; AVX512-NEXT: sete %al
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: cmpb %al, %bl
+; AVX512-NEXT: sete %bpl
 ; AVX512-NEXT: addq %r8, %rdi
 ; AVX512-NEXT: adcq %r9, %rsi
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: cmpb %al, %bl
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: andb %bpl, %al
+; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq %rcx, %rbp
+; AVX512-NEXT: adcq %r10, %rbp
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %r8b
+; AVX512-NEXT: testq %r10, %r10
 ; AVX512-NEXT: setns %bl
 ; AVX512-NEXT: cmpb %bl, %cl
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: andb %al, %cl
-; AVX512-NEXT: andl $1, %ecx
-; AVX512-NEXT: kmovw %ecx, %k1
+; AVX512-NEXT: sete %cl
+; AVX512-NEXT: andb %r8b, %cl
+; AVX512-NEXT: kmovd %ecx, %k0
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovw %eax, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %r14, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%r11)
+; AVX512-NEXT: movq %rdi, (%r11)
+; AVX512-NEXT: movq %rbp, 24(%r11)
+; AVX512-NEXT: movq %rsi, 8(%r11)
 ; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
 ; AVX512-NEXT: retq
 %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
Index: llvm/test/CodeGen/X86/vec_smulo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_smulo.ll
+++ llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2605,9 +2605,9 @@
 ; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: setne %cl
 ; AVX512-NEXT: kmovd %ecx, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512-NEXT: cmpq $0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: setne %cl
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512-NEXT: andl $1, %ecx
 ; AVX512-NEXT: kmovw %ecx, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
Index: llvm/test/CodeGen/X86/vec_ssubo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_ssubo.ll
+++ llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1830,48 +1830,48 @@
 ;
 ; AVX512-LABEL: ssubo_v2i128:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbp
 ; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq %rcx, %r14
-; AVX512-NEXT: sbbq %r11, %r14
-; AVX512-NEXT: setns %bl
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: cmpb %bl, %cl
-; AVX512-NEXT: setne %bl
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: setns %al
-; AVX512-NEXT: cmpb %al, %cl
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: andb %bl, %al
-; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: testq %r9, %r9
 ; AVX512-NEXT: setns %al
 ; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: setns %cl
-; AVX512-NEXT: cmpb %al, %cl
-; AVX512-NEXT: setne %al
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: cmpb %al, %bl
+; AVX512-NEXT: setne %bpl
 ; AVX512-NEXT: subq %r8, %rdi
 ; AVX512-NEXT: sbbq %r9, %rsi
+; AVX512-NEXT: setns %al
+; AVX512-NEXT: cmpb %al, %bl
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: andb %bpl, %al
+; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq %rcx, %rbp
+; AVX512-NEXT: sbbq %r10, %rbp
+; AVX512-NEXT: setns %bl
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: setns %cl
+; AVX512-NEXT: cmpb %bl, %cl
+; AVX512-NEXT: setne %r8b
+; AVX512-NEXT: testq %r10, %r10
 ; AVX512-NEXT: setns %bl
 ; AVX512-NEXT: cmpb %bl, %cl
 ; AVX512-NEXT: setne %cl
-; AVX512-NEXT: andb %al, %cl
-; AVX512-NEXT: andl $1, %ecx
-; AVX512-NEXT: kmovw %ecx, %k1
+; AVX512-NEXT: andb %r8b, %cl
+; AVX512-NEXT: kmovd %ecx, %k0
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
+; AVX512-NEXT: andl $1, %eax
+; AVX512-NEXT: kmovw %eax, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %rdx, 16(%r10)
-; AVX512-NEXT: movq %rdi, (%r10)
-; AVX512-NEXT: movq %r14, 24(%r10)
-; AVX512-NEXT: movq %rsi, 8(%r10)
+; AVX512-NEXT: movq %rdx, 16(%r11)
+; AVX512-NEXT: movq %rdi, (%r11)
+; AVX512-NEXT: movq %rbp, 24(%r11)
+; AVX512-NEXT: movq %rsi, 8(%r11)
 ; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
 ; AVX512-NEXT: retq
 %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
Index: llvm/test/CodeGen/X86/vec_uaddo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_uaddo.ll
+++ llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1282,16 +1282,16 @@
 ; AVX512-LABEL: uaddo_v2i128:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: addq %r8, %rdi
+; AVX512-NEXT: adcq %r9, %rsi
+; AVX512-NEXT: setb %r8b
 ; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT: setb %al
 ; AVX512-NEXT: kmovd %eax, %k0
 ; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: addq %r8, %rdi
-; AVX512-NEXT: adcq %r9, %rsi
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
Index: llvm/test/CodeGen/X86/vec_umulo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_umulo.ll
+++ llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2451,66 +2451,68 @@
 ; AVX512-NEXT: pushq %r13
 ; AVX512-NEXT: pushq %r12
 ; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: movq %rdx, %r12
-; AVX512-NEXT: movq %rdi, %r11
+; AVX512-NEXT: movq %r9, %r10
+; AVX512-NEXT: movq %rcx, %r9
+; AVX512-NEXT: movq %rdx, %r11
+; AVX512-NEXT: movq %rsi, %rax
+; AVX512-NEXT: movq %rdi, %rsi
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
 ; AVX512-NEXT: testq %r10, %r10
 ; AVX512-NEXT: setne %dl
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: setne %r13b
-; AVX512-NEXT: andb %dl, %r13b
-; AVX512-NEXT: mulq %r15
-; AVX512-NEXT: movq %rax, %rdi
+; AVX512-NEXT: testq %rax, %rax
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: andb %dl, %bl
+; AVX512-NEXT: mulq %r8
+; AVX512-NEXT: movq %rax, %r13
 ; AVX512-NEXT: seto %bpl
 ; AVX512-NEXT: movq %r10, %rax
-; AVX512-NEXT: mulq %r12
-; AVX512-NEXT: movq %rax, %rbx
+; AVX512-NEXT: mulq %rdi
+; AVX512-NEXT: movq %rax, %rdi
 ; AVX512-NEXT: seto %cl
 ; AVX512-NEXT: orb %bpl, %cl
-; AVX512-NEXT: addq %rdi, %rbx
-; AVX512-NEXT: movq %r12, %rax
-; AVX512-NEXT: mulq %r15
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: addq %rbx, %r15
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: orb %cl, %al
-; AVX512-NEXT: orb %r13b, %al
-; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: testq %r9, %r9
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: setne %cl
-; AVX512-NEXT: andb %al, %cl
+; AVX512-NEXT: addq %r13, %rdi
 ; AVX512-NEXT: movq %rsi, %rax
 ; AVX512-NEXT: mulq %r8
-; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: seto %bpl
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq %rdx, %r10
+; AVX512-NEXT: addq %rdi, %r10
+; AVX512-NEXT: setb %sil
+; AVX512-NEXT: orb %cl, %sil
+; AVX512-NEXT: orb %bl, %sil
+; AVX512-NEXT: testq %r12, %r12
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: testq %r9, %r9
+; AVX512-NEXT: setne %bpl
+; AVX512-NEXT: andb %al, %bpl
 ; AVX512-NEXT: movq %r9, %rax
-; AVX512-NEXT: mulq %r11
+; AVX512-NEXT: mulq %r15
 ; AVX512-NEXT: movq %rax, %rdi
-; AVX512-NEXT: seto %bl
-; AVX512-NEXT: orb %bpl, %bl
-; AVX512-NEXT: addq %rsi, %rdi
+; AVX512-NEXT: seto %r9b
+; AVX512-NEXT: movq %r12, %rax
+; AVX512-NEXT: mulq %r11
+; AVX512-NEXT: movq %rax, %rbx
+; AVX512-NEXT: seto %cl
+; AVX512-NEXT: orb %r9b, %cl
+; AVX512-NEXT: addq %rdi, %rbx
 ; AVX512-NEXT: movq %r11, %rax
-; AVX512-NEXT: mulq %r8
-; AVX512-NEXT: addq %rdi, %rdx
-; AVX512-NEXT: setb %sil
-; AVX512-NEXT: orb %bl, %sil
-; AVX512-NEXT: orb %cl, %sil
+; AVX512-NEXT: mulq %r15
+; AVX512-NEXT: addq %rbx, %rdx
+; AVX512-NEXT: setb %dil
+; AVX512-NEXT: orb %cl, %dil
+; AVX512-NEXT: orb %bpl, %dil
+; AVX512-NEXT: kmovd %edi, %k0
+; AVX512-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512-NEXT: andl $1, %esi
 ; AVX512-NEXT: kmovw %esi, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: movq %r10, 16(%r14)
-; AVX512-NEXT: movq %rax, (%r14)
-; AVX512-NEXT: movq %r15, 24(%r14)
-; AVX512-NEXT: movq %rdx, 8(%r14)
+; AVX512-NEXT: movq %rax, 16(%r14)
+; AVX512-NEXT: movq %r8, (%r14)
+; AVX512-NEXT: movq %rdx, 24(%r14)
+; AVX512-NEXT: movq %r10, 8(%r14)
 ; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: popq %r12
 ; AVX512-NEXT: popq %r13
Index: llvm/test/CodeGen/X86/vec_usubo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_usubo.ll
+++ llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1329,16 +1329,16 @@
 ; AVX512-LABEL: usubo_v2i128:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: subq %r8, %rdi
+; AVX512-NEXT: sbbq %r9, %rsi
+; AVX512-NEXT: setb %r8b
 ; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT: setb %al
 ; AVX512-NEXT: kmovd %eax, %k0
 ; AVX512-NEXT: kshiftlw $1, %k0, %k0
-; AVX512-NEXT: subq %r8, %rdi
-; AVX512-NEXT: sbbq %r9, %rsi
-; AVX512-NEXT: setb %al
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: andl $1, %r8d
+; AVX512-NEXT: kmovw %r8d, %k1
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}