diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40611,11 +40611,57 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT ShuffleVT = N.getValueType();
-  auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
+  auto IsBinOp = [&TLI](unsigned Opcode) {
+    switch (Opcode) {
+    case X86ISD::VSHL:
+    case X86ISD::VSHLI:
+    case X86ISD::VSRL:
+    case X86ISD::VSRLI:
+    case X86ISD::VSRA:
+    case X86ISD::VSRAI:
+      return true;
+    default:
+      return TLI.isBinOp(Opcode);
+    }
+  };
+  auto IsMatchingBinOp = [](SDValue X, SDValue Y) {
+    if (X.getOpcode() != Y.getOpcode())
+      return false;
+    switch (X.getOpcode()) {
+    case X86ISD::VSHL:
+    case X86ISD::VSHLI:
+    case X86ISD::VSRL:
+    case X86ISD::VSRLI:
+    case X86ISD::VSRA:
+    case X86ISD::VSRAI:
+      // SSE vector shifts must have matching (scalar) shift amounts.
+      return X.getOperand(1) == Y.getOperand(1);
+    default:
+      return true;
+    }
+  };
+  auto IgnoreOp = [](unsigned Opcode, unsigned OpNo) {
+    switch (Opcode) {
+    case X86ISD::VSHL:
+    case X86ISD::VSHLI:
+    case X86ISD::VSRL:
+    case X86ISD::VSRLI:
+    case X86ISD::VSRA:
+    case X86ISD::VSRAI:
+      return OpNo == 1;
+    default:
+      return false;
+    }
+  };
+  auto IsMergeableWithShuffle = [&DAG, &IgnoreOp](SDValue Src, unsigned OpNo,
+                                                  bool FoldLoad = false) {
     // AllZeros/AllOnes constants are freely shuffled and will peek through
     // bitcasts. Other constant build vectors do not peek through bitcasts. Only
     // merge with target shuffles if it has one use so shuffle combining is
     // likely to kick in. Shuffles of splats are expected to be removed.
+    if (IgnoreOp(Src.getOpcode(), OpNo))
+      return true;
+    SDValue Op = peekThroughOneUseBitcasts(Src.getOperand(OpNo));
     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
            ISD::isBuildVectorAllZeros(Op.getNode()) ||
            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
@@ -40655,26 +40701,23 @@
       N->isOnlyUserOf(N.getOperand(0).getNode())) {
     SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
     unsigned SrcOpcode = N0.getOpcode();
-    if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
-      SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
-      SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
-      if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
-          IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
-        SDValue LHS, RHS;
-        Op00 = DAG.getBitcast(ShuffleVT, Op00);
-        Op01 = DAG.getBitcast(ShuffleVT, Op01);
-        if (N.getNumOperands() == 2) {
-          LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
-          RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
-        } else {
-          LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
-          RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
+    if (IsBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
+      EVT OpVT = N0.getValueType();
+      if (IsMergeableWithShuffle(N0, 0, Opc != X86ISD::PSHUFB) ||
+          IsMergeableWithShuffle(N0, 1, Opc != X86ISD::PSHUFB)) {
+        SmallVector<SDValue> Ops(N0->ops());
+        for (int i = 0; i != 2; ++i) {
+          if (IgnoreOp(SrcOpcode, i))
+            continue;
+          Ops[i] = DAG.getBitcast(ShuffleVT, Ops[i]);
+          Ops[i] =
+              N.getNumOperands() == 2
+                  ?
DAG.getNode(Opc, DL, ShuffleVT, Ops[i], N.getOperand(1)) + : DAG.getNode(Opc, DL, ShuffleVT, Ops[i]); + Ops[i] = DAG.getBitcast(OpVT, Ops[i]); } - EVT OpVT = N0.getValueType(); return DAG.getBitcast(ShuffleVT, - DAG.getNode(SrcOpcode, DL, OpVT, - DAG.getBitcast(OpVT, LHS), - DAG.getBitcast(OpVT, RHS))); + DAG.getNode(SrcOpcode, DL, OpVT, Ops)); } } } @@ -40700,36 +40743,30 @@ SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); unsigned SrcOpcode = N0.getOpcode(); - if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode && + if (IsBinOp(SrcOpcode) && IsMatchingBinOp(N0, N1) && IsSafeToMoveShuffle(N0, SrcOpcode) && IsSafeToMoveShuffle(N1, SrcOpcode)) { - SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); - SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0)); - SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1)); + EVT OpVT = N0.getValueType(); // Ensure the total number of shuffles doesn't increase by folding this // shuffle through to the source ops. - if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) || - (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) || - ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) && - (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) { - SDValue LHS, RHS; - Op00 = DAG.getBitcast(ShuffleVT, Op00); - Op10 = DAG.getBitcast(ShuffleVT, Op10); - Op01 = DAG.getBitcast(ShuffleVT, Op01); - Op11 = DAG.getBitcast(ShuffleVT, Op11); - if (N.getNumOperands() == 3) { - LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2)); - RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2)); - } else { - LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10); - RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11); + if (((IsMergeableWithShuffle(N0, 0) && IsMergeableWithShuffle(N1, 0)) || + (IsMergeableWithShuffle(N0, 1) && IsMergeableWithShuffle(N1, 1))) || + ((IsMergeableWithShuffle(N0, 0) || IsMergeableWithShuffle(N1, 0)) && + (IsMergeableWithShuffle(N0, 1) || IsMergeableWithShuffle(N1, 1)))) { + SmallVector Ops(N0->ops()); + for (int i = 0; i != 2; ++i) { + if (IgnoreOp(SrcOpcode, i)) + continue; + SDValue LHS = DAG.getBitcast(ShuffleVT, N0.getOperand(i)); + SDValue RHS = DAG.getBitcast(ShuffleVT, N1.getOperand(i)); + Ops[i] = + N.getNumOperands() == 3 + ? 
DAG.getNode(Opc, DL, ShuffleVT, LHS, RHS, N.getOperand(2)) + : DAG.getNode(Opc, DL, ShuffleVT, LHS, RHS); + Ops[i] = DAG.getBitcast(OpVT, Ops[i]); } - EVT OpVT = N0.getValueType(); return DAG.getBitcast(ShuffleVT, - DAG.getNode(SrcOpcode, DL, OpVT, - DAG.getBitcast(OpVT, LHS), - DAG.getBitcast(OpVT, RHS))); + DAG.getNode(SrcOpcode, DL, OpVT, Ops)); } } } diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll --- a/llvm/test/CodeGen/X86/abds-vector-128.ll +++ b/llvm/test/CodeGen/X86/abds-vector-128.ll @@ -81,44 +81,36 @@ ; SSE2-NEXT: psubq %xmm10, %xmm7 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE2-NEXT: psubq %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: psubq %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: psubq %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: psubq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm8 ; SSE2-NEXT: psubq %xmm1, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] @@ -466,44 +458,36 @@ ; SSE2-NEXT: psubq %xmm10, %xmm7 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE2-NEXT: psubq %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: 
psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: psubq %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: psubq %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: psubq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm8 ; SSE2-NEXT: psubq %xmm1, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] @@ -814,24 +798,20 @@ ; SSE2-NEXT: psubq %xmm6, %xmm4 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1012,24 +992,20 @@ ; SSE2-NEXT: psubq %xmm6, %xmm4 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1194,14 +1170,12 @@ ; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: psubq %xmm5, %xmm2 -; SSE2-NEXT: movdqa 
%xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] @@ -1299,14 +1273,12 @@ ; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: psubq %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] @@ -1832,9 +1804,8 @@ ; SSE2-LABEL: abd_subnsw_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -64,44 +64,36 @@ ; SSE2-NEXT: psubq %xmm10, %xmm8 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: psubq %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: psubq %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: psubq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm8 ; SSE2-NEXT: psubq %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; 
SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] @@ -430,44 +422,36 @@ ; SSE2-NEXT: psubq %xmm10, %xmm8 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: psubq %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: psubq %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: psubq %xmm1, %xmm7 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm8 ; SSE2-NEXT: psubq %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] @@ -767,24 +751,20 @@ ; SSE2-NEXT: psubq %xmm6, %xmm4 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -956,24 +936,20 @@ ; SSE2-NEXT: psubq %xmm6, %xmm4 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE2-NEXT: psubq %xmm1, 
%xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1131,14 +1107,12 @@ ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] @@ -1226,14 +1200,12 @@ ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp.ll @@ -191,8 +191,8 @@ ; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vcmpnltpd %zmm0, %zmm1, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpsrld $31, %ymm0, %ymm1 -; KNL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; KNL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4] +; KNL-NEXT: vpsrld $31, %ymm1, %ymm1 ; KNL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; KNL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq @@ -202,8 +202,8 @@ ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vcmpnltpd %zmm0, %zmm1, %k0 ; SKX-NEXT: vpmovm2d %k0, %ymm0 -; SKX-NEXT: vpsrld $31, %ymm0, %ymm1 -; SKX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; SKX-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4] +; SKX-NEXT: vpsrld $31, %ymm1, %ymm1 ; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; SKX-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/blend-of-shift.ll b/llvm/test/CodeGen/X86/blend-of-shift.ll --- a/llvm/test/CodeGen/X86/blend-of-shift.ll +++ 
b/llvm/test/CodeGen/X86/blend-of-shift.ll @@ -9,17 +9,15 @@ define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_shl_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: psllw $15, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_shl_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15) %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15) @@ -31,17 +29,15 @@ define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_lshr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlw $15, %xmm0 -; SSE2-NEXT: psrlw $15, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrlw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_lshr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15) %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15) @@ -53,17 +49,15 @@ define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_ashr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_ashr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15) %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15) @@ -76,17 +70,15 @@ define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_shl_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_shl_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31) %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31) @@ -96,17 +88,15 @@ define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_lshr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrld $31, %xmm0 -; SSE2-NEXT: psrld $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; 
SSE2-NEXT: psrld $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_lshr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $31, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31) %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31) @@ -116,17 +106,15 @@ define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i32_of_ashr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_ashr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0] +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31) %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31) @@ -255,10 +243,9 @@ define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_shl_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: psllw $15, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_shl_i16: @@ -277,14 +264,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_lshr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_lshr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15) %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15) @@ -296,10 +283,9 @@ define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_ashr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_ashr_i16: @@ -319,10 +305,9 @@ define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_shl_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_shl_i32: @@ -341,10 +326,9 @@ define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_lshr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrld $31, %xmm0 -; SSE2-NEXT: psrld $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: 
movaps %xmm1, %xmm0 +; SSE2-NEXT: psrld $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_lshr_i32: @@ -363,10 +347,9 @@ define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_ashr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_ashr_i32: @@ -386,10 +369,9 @@ define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_shl_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psllq $63, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psllq $63, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_shl_i64: @@ -408,10 +390,9 @@ define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE2-LABEL: shuffle_i64_of_lshr_i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $63, %xmm0 -; SSE2-NEXT: psrlq $63, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrlq $63, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_lshr_i64: diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll --- a/llvm/test/CodeGen/X86/combine-abs.ll +++ b/llvm/test/CodeGen/X86/combine-abs.ll @@ -107,14 +107,12 @@ define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) { ; SSE2-LABEL: combine_v4i64_abs_abs: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psubq %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psubq %xmm2, %xmm1 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -325,9 +325,8 @@ define <2 x i64> @combine_mul_to_abs_v2i64(<2 x i64> %x) { ; SSE-LABEL: combine_mul_to_abs_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1514,9 +1514,8 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: psrlq $62, %xmm1 ; SSE2-NEXT: paddq %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: psrlq $2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -1599,16 +1598,14 @@ ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: psrlq $62, %xmm2 ; SSE2-NEXT: paddq 
%xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] ; SSE2-NEXT: psrad $2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE2-NEXT: psrlq $2, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrlq $61, %xmm3 ; SSE2-NEXT: psrlq $60, %xmm2 @@ -1634,9 +1631,8 @@ ; SSE41-NEXT: psrlq $2, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlq $60, %xmm3 ; SSE41-NEXT: psrlq $61, %xmm2 @@ -1735,9 +1731,8 @@ ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: psrlq $62, %xmm4 ; SSE2-NEXT: paddq %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] ; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] ; SSE2-NEXT: psrlq $2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -1746,16 +1741,14 @@ ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: psrlq $62, %xmm4 ; SSE2-NEXT: paddq %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] ; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] ; SSE2-NEXT: psrlq $2, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrlq $61, %xmm5 ; SSE2-NEXT: psrlq $60, %xmm4 @@ -1768,9 +1761,8 @@ ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] ; SSE2-NEXT: xorpd %xmm4, %xmm1 ; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: psrlq $61, %xmm6 ; SSE2-NEXT: psrlq $60, %xmm5 @@ -1804,9 +1796,8 @@ ; SSE41-NEXT: psrlq $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: psrlq $60, %xmm5 ; SSE41-NEXT: psrlq $61, %xmm4 @@ -1819,9 +1810,8 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] ; SSE41-NEXT: pxor %xmm4, %xmm1 ; SSE41-NEXT: psubq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psrlq 
$60, %xmm6 ; SSE41-NEXT: psrlq $61, %xmm5 diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -430,25 +430,22 @@ define <4 x float> @PR45794(<2 x i64> %x, <2 x i64> %y) { ; SSE-LABEL: PR45794: ; SSE: # %bb.0: -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: PR45794: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR45794: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -541,8 +541,8 @@ define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind { ; X86-LABEL: freeze_ashr_vec_outofrange: ; X86: # %bb.0: -; X86-NEXT: psrad $1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; X86-NEXT: psrad $1, %xmm0 ; X86-NEXT: psrad $2, %xmm0 ; X86-NEXT: retl ; @@ -650,8 +650,8 @@ define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind { ; X86-LABEL: freeze_lshr_vec_outofrange: ; X86: # %bb.0: -; X86-NEXT: psrld $1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; X86-NEXT: psrld $1, %xmm0 ; X86-NEXT: psrld $2, %xmm0 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -114,9 +114,8 @@ ; X86-LABEL: signbits_ashr_sitofp_1: ; X86: # %bb.0: ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vpsrad $16, %xmm1, %xmm1 -; X86-NEXT: vpsrad $16, %xmm0, %xmm0 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; X86-NEXT: vpsrad $16, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -124,9 +123,8 @@ ; X64-AVX1-LABEL: signbits_ashr_sitofp_1: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; X64-AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq @@ -192,8 +190,8 @@ ; X86-LABEL: signbits_ashr_shl_extract_sitofp: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vpsllq $20, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -203,8 +201,8 @@ ; ; X64-LABEL: signbits_ashr_shl_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vpsllq $20, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -338,8 +336,8 @@ ; X86-LABEL: 
signbits_ashr_sext_sextinreg_and_extract_sitofp: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -350,8 +348,8 @@ ; ; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vmovd %edi, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -371,8 +369,8 @@ ; X86-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: vpsrad $29, %xmm0, %xmm0 ; X86-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -382,8 +380,8 @@ ; ; X64-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vpsrad $29, %xmm0, %xmm0 ; X64-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -407,13 +405,13 @@ ; X86-NEXT: subl $16, %esp ; X86-NEXT: vmovapd 8(%ebp), %xmm3 ; X86-NEXT: vpsrad $31, %xmm2, %xmm4 -; X86-NEXT: vpsrad $1, %xmm2, %xmm5 -; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X86-NEXT: vpsrad $1, %xmm5, %xmm5 ; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] ; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X86-NEXT: vpsrad $31, %xmm2, %xmm5 -; X86-NEXT: vpsrad $1, %xmm2, %xmm2 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-NEXT: vpsrad $1, %xmm2, %xmm2 ; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] ; X86-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 @@ -435,13 +433,13 @@ ; X64-AVX1-LABEL: signbits_ashr_sext_select_shuffle_sitofp: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-AVX1-NEXT: vpsrad $1, %xmm5, %xmm5 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm5 -; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 @@ -460,8 +458,8 @@ ; ; X64-AVX2-LABEL: signbits_ashr_sext_select_shuffle_sitofp: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; X64-AVX2-NEXT: vpsrad $1, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -9,10 +9,10 @@ define <4 
x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; SSE-LABEL: trunc_ashr_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: psrad $31, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; @@ -42,17 +42,15 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) { ; SSE-LABEL: trunc_ashr_v4i64_bitcast: ; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: psrad $17, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: psrad $17, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: ret{{[l|q]}} @@ -61,12 +59,12 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $17, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpsrad $17, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper @@ -75,8 +73,8 @@ ; AVX2-LABEL: trunc_ashr_v4i64_bitcast: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr32907.ll b/llvm/test/CodeGen/X86/pr32907.ll --- a/llvm/test/CodeGen/X86/pr32907.ll +++ b/llvm/test/CodeGen/X86/pr32907.ll @@ -8,9 +8,8 @@ ; SSE2-LABEL: PR32907: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: psubq %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -37,8 +37,8 @@ ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,1,3,3] ; SSE2-NEXT: psllq $63, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -135,22 +135,22 @@ define <4 x i32> 
@rot_v4i32_mask_ashr1(<4 x i32> %a0) { ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1134,13 +1134,13 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i64: @@ -1162,13 +1162,13 @@ ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i64: @@ -1244,108 +1244,106 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 ; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: paddq %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: paddq %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSSE3-NEXT: 
pcmpeqd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 ; SSSE3-NEXT: paddq %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: @@ -1448,204 +1446,202 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-LABEL: v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pxor %xmm10, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: paddq %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm8, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: paddq %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 ; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pxor %xmm11, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm11, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 ; SSE2-NEXT: paddq %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; 
SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 ; SSE2-NEXT: paddq %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 +; SSE2-NEXT: pxor %xmm5, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm11, %xmm5 +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm9 +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: paddq %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm9, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: paddq %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm9 +; SSSE3-NEXT: movdqa %xmm0, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; 
SSSE3-NEXT: pand %xmm12, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: pxor %xmm9, %xmm9 ; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 -; SSSE3-NEXT: pxor %xmm10, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm10 -; SSSE3-NEXT: pandn %xmm0, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm9, %xmm0 -; SSSE3-NEXT: pand %xmm11, %xmm0 -; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: paddq %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: pxor %xmm8, %xmm11 -; SSSE3-NEXT: movdqa %xmm10, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm11, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: paddq %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 ; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pxor %xmm11, %xmm10 -; SSSE3-NEXT: movdqa %xmm10, %xmm5 -; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: pand %xmm10, %xmm1 +; SSSE3-NEXT: pxor %xmm11, %xmm1 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm5 ; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 ; SSSE3-NEXT: paddq %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm5, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm10 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] ; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm10, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm11, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 ; SSSE3-NEXT: paddq %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm3, %xmm8 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm10 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 +; SSSE3-NEXT: pxor %xmm5, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm11, %xmm5 +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm9 +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -579,9 +579,8 @@ ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; X64-NEXT: psrad $31, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; X64-NEXT: psrlq $31, %xmm3 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -703,9 +702,8 @@ ; X64-NEXT: pshufd $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[0,1,1,3] ; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X64-NEXT: psrlq $31, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/select-sra.ll b/llvm/test/CodeGen/X86/select-sra.ll --- a/llvm/test/CodeGen/X86/select-sra.ll +++ b/llvm/test/CodeGen/X86/select-sra.ll @@ -89,8 +89,8 @@ define <2 x i64> @isnonneg_v2i64(<2 x i64> %x) { ; CHECK-LABEL: isnonneg_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %cond = icmp sgt <2 x i64> %x, @@ -186,8 +186,8 @@ define <2 x i64> @isneg_v2i64(<2 x i64> %x) { ; CHECK-LABEL: isneg_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %cond = icmp slt <2 x i64> %x, zeroinitializer diff --git a/llvm/test/CodeGen/X86/shift-logic.ll b/llvm/test/CodeGen/X86/shift-logic.ll --- a/llvm/test/CodeGen/X86/shift-logic.ll +++ b/llvm/test/CodeGen/X86/shift-logic.ll @@ -110,15 +110,13 @@ define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: ashr_or: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; CHECK-NEXT: psrad $7, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-NEXT: psrlq $7, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; CHECK-NEXT: psrad $12, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; CHECK-NEXT: psrlq $12, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1921,9 +1921,7 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: psllq $32, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1944,9 +1942,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: psllq $32, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) diff --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll --- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll @@ -9,14 +9,14 @@ define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: 
shuffle_i32_of_shl_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_shl_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <4 x i32> @@ -26,14 +26,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: shuffle_i32_of_lshr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_lshr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <4 x i32> @@ -43,14 +43,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: shuffle_i32_of_ashr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psraw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: psraw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_ashr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <4 x i32> @@ -61,14 +61,14 @@ define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i32_of_shl_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_shl_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31) %i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> @@ -77,14 +77,14 @@ define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i32_of_lshr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrld $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: psrld $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_lshr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31) %i2 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> @@ -93,14 +93,14 @@ define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i32_of_ashr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i32_of_ashr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31) %i2 = shufflevector 
<4 x i32> %i1, <4 x i32> poison, <4 x i32> @@ -186,14 +186,14 @@ define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_shl_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_shl_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <2 x i64> @@ -203,14 +203,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_lshr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_lshr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <2 x i64> @@ -220,14 +220,14 @@ define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_ashr_i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psraw $15, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psraw $15, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_ashr_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15) %i2 = bitcast <8 x i16> %i1 to <2 x i64> @@ -238,14 +238,14 @@ define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_shl_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_shl_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31) %i2 = bitcast <4 x i32> %i1 to <2 x i64> @@ -255,14 +255,14 @@ define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_lshr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrld $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psrld $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_lshr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31) %i2 = bitcast <4 x i32> %i1 to <2 x i64> @@ -272,14 +272,14 @@ define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x) nounwind { ; SSE2-LABEL: shuffle_i64_of_ashr_i32: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: shuffle_i64_of_ashr_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 ; 
AVX2-NEXT: ret{{[l|q]}} %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31) %i2 = bitcast <4 x i32> %i1 to <2 x i64> diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1210,13 +1210,13 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v2i64: @@ -1245,13 +1245,13 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm0, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v2i64: @@ -1341,134 +1341,132 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE2-LABEL: v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: psubq %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: psubq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm2, 
%xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 ; SSE2-NEXT: psubq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: psubq %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: psubq %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm0 ; SSSE3-NEXT: pshufd 
{{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 ; SSSE3-NEXT: psubq %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSSE3-NEXT: pand %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v4i64: @@ 
-1592,254 +1590,252 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE2-LABEL: v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm9, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm11 +; SSE2-NEXT: pxor %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm9, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: psubq %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pxor 
%xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm11 +; SSE2-NEXT: por %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: psubq %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm2 ; SSE2-NEXT: psubq %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd 
%xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: por %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSSE3-NEXT: pxor %xmm9, %xmm0 +; SSSE3-NEXT: psubq %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm10 +; SSSE3-NEXT: movdqa %xmm0, %xmm11 ; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm9 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm0, %xmm10 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm9 -; SSSE3-NEXT: pxor %xmm10, %xmm9 -; SSSE3-NEXT: movdqa %xmm9, %xmm10 -; SSSE3-NEXT: pandn %xmm0, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm11 +; SSSE3-NEXT: pxor %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSSE3-NEXT: 
psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm0 -; SSSE3-NEXT: por %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: psubq %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm9, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: pand %xmm11, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm11 +; SSSE3-NEXT: por %xmm11, %xmm0 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: psubq %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm9 -; SSSE3-NEXT: pxor %xmm10, %xmm9 -; SSSE3-NEXT: movdqa %xmm9, %xmm5 -; SSSE3-NEXT: pandn %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm11 +; SSSE3-NEXT: pxor %xmm4, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: pand %xmm11, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm11 +; SSSE3-NEXT: por %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: psubq %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: 
pxor %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pandn %xmm2, %xmm8 +; SSSE3-NEXT: por %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm2 ; SSSE3-NEXT: psubq %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pandn %xmm3, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: por %xmm6, %xmm3 +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: v8i64: diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll --- 
a/llvm/test/CodeGen/X86/vec_shift5.ll +++ b/llvm/test/CodeGen/X86/vec_shift5.ll @@ -258,8 +258,8 @@ ; CHECK-NEXT: movd %xmm1, %ecx ; CHECK-NEXT: addl $3, %ecx ; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: psrad %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: psrad %xmm1, %xmm0 ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: imull %ecx, %eax ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -1777,12 +1777,12 @@ ; SSE2-NEXT: pinsrw $6, %eax, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] ; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE2-NEXT: psllq $63, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i1_to_4i64: @@ -1807,12 +1807,12 @@ ; SSSE3-NEXT: pinsrw $6, %eax, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] ; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSSE3-NEXT: psllq $63, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_4i1_to_4i64: @@ -1837,12 +1837,12 @@ ; SSE41-NEXT: movzbl %al, %eax ; SSE41-NEXT: pinsrb $12, %eax, %xmm1 ; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE41-NEXT: psllq $63, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: load_sext_4i1_to_4i64: @@ -1941,12 +1941,12 @@ ; X86-SSE2-NEXT: pinsrw $6, %eax, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] ; X86-SSE2-NEXT: psllq $63, %xmm0 -; X86-SSE2-NEXT: psrad $31, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: psrad $31, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; X86-SSE2-NEXT: psllq $63, %xmm1 -; X86-SSE2-NEXT: psrad $31, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: psrad $31, %xmm1 ; X86-SSE2-NEXT: retl ; ; X86-SSE41-LABEL: load_sext_4i1_to_4i64: @@ -1972,12 +1972,12 @@ ; X86-SSE41-NEXT: movzbl %al, %eax ; X86-SSE41-NEXT: pinsrb $12, %eax, %xmm1 ; X86-SSE41-NEXT: psllq $63, %xmm0 -; X86-SSE41-NEXT: psrad $31, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: psrad $31, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; X86-SSE41-NEXT: psllq $63, %xmm1 -; X86-SSE41-NEXT: psrad $31, %xmm1 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: psrad $31, %xmm1 ; X86-SSE41-NEXT: retl entry: %X = load <4 x i1>, ptr %ptr @@ -3683,38 +3683,34 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE2-NEXT: psllq $58, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: psrad $26, %xmm0 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE2-NEXT: psllq $58, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: psrad $26, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE2-NEXT: psllq $58, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSE2-NEXT: psrad $26, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSE2-NEXT: psllq $58, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSE2-NEXT: psrad $26, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE2-NEXT: retq ; @@ -3727,38 +3723,34 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSSE3-NEXT: psllq $58, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSSE3-NEXT: psrad $26, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSSE3-NEXT: psllq $58, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSSE3-NEXT: psrad $26, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSSE3-NEXT: psllq $58, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSSE3-NEXT: psrad $26, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSSE3-NEXT: psllq $58, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSSE3-NEXT: psrad $26, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSSE3-NEXT: retq ; @@ -3768,36 +3760,32 @@ ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE41-NEXT: paddw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: psrad $26, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: psrad $26, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: psrad $26, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: psllq $58, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: psrad $26, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; SSE41-NEXT: retq ; @@ -3851,38 +3839,34 @@ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; X86-SSE2-NEXT: psllq $58, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrad $31, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: psrad $26, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; X86-SSE2-NEXT: psllq $58, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: psrad $31, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; X86-SSE2-NEXT: psrad $26, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; X86-SSE2-NEXT: psllq $58, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: psrad 
$31, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; X86-SSE2-NEXT: psrad $26, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; X86-SSE2-NEXT: psllq $58, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: psrad $31, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; X86-SSE2-NEXT: psrad $26, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; X86-SSE2-NEXT: retl ; @@ -3892,36 +3876,32 @@ ; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; X86-SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; X86-SSE41-NEXT: psllq $58, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; X86-SSE41-NEXT: psllq $58, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: psrad $31, %xmm1 ; X86-SSE41-NEXT: psrad $26, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; X86-SSE41-NEXT: psllq $58, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X86-SSE41-NEXT: psllq $58, %xmm2 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: psrad $31, %xmm2 ; X86-SSE41-NEXT: psrad $26, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; X86-SSE41-NEXT: psllq $58, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; X86-SSE41-NEXT: psllq $58, %xmm4 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: psrad $31, %xmm4 ; X86-SSE41-NEXT: psrad $26, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; X86-SSE41-NEXT: psllq $58, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; X86-SSE41-NEXT: psllq $58, %xmm4 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: psrad $31, %xmm4 ; X86-SSE41-NEXT: psrad $26, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; X86-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; X86-SSE41-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1385,9 +1385,9 @@ ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movapd %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: psraw $2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: andps %xmm0, %xmm1 @@ -1456,9 +1456,9 @@ ; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X86-SSE-NEXT: movapd %xmm1, %xmm2 ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE-NEXT: psraw $2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] -; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] ; X86-SSE-NEXT: movaps %xmm2, %xmm1 ; X86-SSE-NEXT: andps %xmm0, %xmm1 @@ -1571,9 +1571,8 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: splatconstant_shift_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; SSE2-NEXT: psrad $7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: psrlq $7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1621,9 +1620,8 @@ ; ; X86-SSE-LABEL: splatconstant_shift_v2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; X86-SSE-NEXT: psrad $7, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE-NEXT: psrlq $7, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1712,12 +1712,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $4, %xmm1 -; SSE2-NEXT: psrad $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: psrad $4, %xmm0 +; SSE2-NEXT: psrad $5, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: @@ -1762,12 +1760,10 @@ ; ; X86-SSE-LABEL: constant_shift_v2i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrad $4, %xmm1 -; X86-SSE-NEXT: psrad $5, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: psrad $4, %xmm0 +; X86-SSE-NEXT: psrad $5, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: retl %shift = ashr 
<2 x i32> %a, <i32 4, i32 5> ret <2 x i32> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1418,12 +1418,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: psrld $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: psrld $4, %xmm0 +; SSE2-NEXT: psrld $5, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: @@ -1468,12 +1466,10 @@ ; ; X86-SSE-LABEL: constant_shift_v2i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psrld $4, %xmm1 -; X86-SSE-NEXT: psrld $5, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: psrld $4, %xmm0 +; X86-SSE-NEXT: psrld $5, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: retl %shift = lshr <2 x i32> %a, <i32 4, i32 5> ret <2 x i32> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1258,12 +1258,10 @@ define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $4, %xmm1 -; SSE2-NEXT: pslld $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $4, %xmm0 +; SSE2-NEXT: pslld $5, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i32: @@ -1308,12 +1306,10 @@ ; ; X86-SSE-LABEL: constant_shift_v2i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: pslld $4, %xmm1 -; X86-SSE-NEXT: pslld $5, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pslld $4, %xmm0 +; X86-SSE-NEXT: pslld $5, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: retl %shift = shl <2 x i32> %a, <i32 4, i32 5> ret <2 x i32> %shift diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -3100,11 +3100,10 @@ ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: psrad $31, %xmm0 ; X86-SSE2-NEXT:
pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrad $31, %xmm1 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: @@ -3126,11 +3125,10 @@ ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-SSE2-NEXT: psrad $31, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrad $31, %xmm1 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: @@ -3155,53 +3153,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_33(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrad $31, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: psrad $1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrad $31, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X64-SSE2-NEXT: psrad $1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_33: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024> @@ -3211,53 +3207,51 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrad $31, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: psrad $2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: psrad $31, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; X64-SSE2-NEXT: psrad $2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_34: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX2-NEXT: vpsrad $2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024> diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll --- a/llvm/test/CodeGen/X86/viabs.ll +++ b/llvm/test/CodeGen/X86/viabs.ll @@ -524,18 +524,16 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_abs_ge_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_ge_v2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: psubq %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -574,28 +572,24 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind { ;
SSE2-LABEL: test_abs_gt_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psubq %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psubq %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_gt_v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: psubq %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSSE3-NEXT: psubq %xmm2, %xmm1 ; SSSE3-NEXT: retq @@ -643,48 +637,40 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind { ; SSE2-LABEL: test_abs_le_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq @@ -751,24 +737,20 @@ ; SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu 32(%rdi), %xmm2 ; SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq @@ -779,24 +761,20 @@ ; SSSE3-NEXT: movdqu 16(%rdi), %xmm1 ; SSSE3-NEXT: movdqu 32(%rdi), %xmm2 ; SSSE3-NEXT: movdqu 48(%rdi), %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -308,8 +308,8 @@ define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: signbit_mask_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -343,8 +343,8 @@ define <2 x i64> @signbit_mask_swap_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: signbit_mask_swap_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -521,11 +521,11 @@ define <4 x i64> @signbit_mask_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: signbit_mask_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: retq ; @@ -673,8 +673,8 @@ define <2 x i64> @signbit_setmask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-LABEL: signbit_setmask_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -815,11 +815,11 @@ define <4 x i64> @signbit_setmask_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: signbit_setmask_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; 
SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; @@ -867,11 +867,11 @@ define <4 x i64> @signbit_setmask_swap_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: signbit_setmask_swap_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -484,8 +484,8 @@ ; SSE2-LABEL: shrunkblend_2uses: ; SSE2: # %bb.0: ; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm1 @@ -523,8 +523,8 @@ ; SSE2-LABEL: shrunkblend_nonvselectuse: ; SSE2: # %bb.0: ; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm1 @@ -536,8 +536,8 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: psllq $63, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vshift-3.ll b/llvm/test/CodeGen/X86/vshift-3.ll --- a/llvm/test/CodeGen/X86/vshift-3.ll +++ b/llvm/test/CodeGen/X86/vshift-3.ll @@ -11,20 +11,20 @@ ; X86-LABEL: shift1a: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X86-NEXT: psrad $31, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-NEXT: movdqa %xmm1, (%eax) +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrad $31, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: shift1a: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-NEXT: psrad $31, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movdqa %xmm1, (%rdi) +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: %ashr = ashr <2 x i64> %val, < i64 32, i64 32 > diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -43,8 +43,8 @@ ; CHECK-NEXT: andnps %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; CHECK-NEXT: psllq $63, %xmm0 -; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pmovsxdq %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%rdi) ; CHECK-NEXT: movq %xmm0, 16(%rdi)