Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1089,6 +1089,10 @@
   SDValue Cond = N->getOperand(0);
   EVT OpTy = N->getOperand(1).getValueType();
 
+  if (N->getOpcode() == ISD::VSELECT)
+    if (SDValue Res = WidenVSELECT_SETCC(N))
+      return Res;
+
   // Promote all the way up to the canonical SetCC type.
   EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
   Cond = PromoteTargetBoolean(Cond, OpVT);
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -724,6 +724,7 @@
   SDValue WidenVecRes_UNDEF(SDNode *N);
   SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
   SDValue WidenVecRes_VSETCC(SDNode* N);
+  SDValue WidenVSELECT_SETCC(SDNode *N);
 
   SDValue WidenVecRes_Ternary(SDNode *N);
   SDValue WidenVecRes_Binary(SDNode *N);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2859,6 +2859,83 @@
                      WidenVT, N->getOperand(0));
 }
 
+// This method is needed to handle the case where either the SetCCResultType
+// or the VSELECT VT needs widening. It only handles the case where all the
+// target vector registers are of the same size and hold a varying number of
+// elements depending on the element size.
+SDValue DAGTypeLegalizer::WidenVSELECT_SETCC(SDNode *N) {
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Cond = N->getOperand(0);
+
+  // Currently only handling the common case where Cond is a SETCC.
+  // TODO: Also handle operands other than SETCC that are produced by
+  // SimplifySetCC().
+  if (N->getOpcode() != ISD::VSELECT || Cond->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Get the VTs for the SETCC and VSELECT, and widen them and the VSELECT
+  // operands if needed.
+  EVT SetccVT = getSetCCResultType(Cond->getOperand(0).getValueType());
+  if (getTypeAction(SetccVT) == TargetLowering::TypeWidenVector)
+    SetccVT = TLI.getTypeToTransformTo(Ctx, SetccVT);
+
+  EVT VSelVT = N->getValueType(0);
+  SDValue VSelOp1 = N->getOperand(1);
+  SDValue VSelOp2 = N->getOperand(2);
+  if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+    VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+    VSelOp1 = GetWidenedVector(VSelOp1);
+    VSelOp2 = GetWidenedVector(VSelOp2);
+  }
+
+  // The mask of the VSELECT should have integer elements.
+  EVT MaskVT = VSelVT;
+  if (!MaskVT.getScalarType().isInteger())
+    MaskVT = MaskVT.changeVectorElementTypeToInteger();
+
+  // SetccVT and VSelVT must be of the same size if they are both legal.
+  if (getTypeAction(SetccVT) == TargetLowering::TypeLegal &&
+      getTypeAction(VSelVT) == TargetLowering::TypeLegal &&
+      SetccVT.getSizeInBits() != VSelVT.getSizeInBits())
+    return SDValue();
+
+  // If SetccVT has smaller elements than MaskVT, a SIGN_EXTEND_VECTOR_INREG
+  // is emitted below, which requires the two types to have the same total
+  // size with SetccVT holding more elements. Bail out otherwise.
+  if ((SetccVT.getScalarSizeInBits() < MaskVT.getScalarSizeInBits()) &&
+      ((SetccVT.getSizeInBits() != MaskVT.getSizeInBits()) ||
+       (SetccVT.getVectorNumElements() <= MaskVT.getVectorNumElements())))
+    return SDValue();
+
+  // Make a new SETCC node, with a legal VT.
+  Cond = DAG.getNode(ISD::SETCC, SDLoc(Cond), SetccVT,
+                     Cond->getOperand(0), Cond->getOperand(1),
+                     Cond->getOperand(2));
+
+  // After widening, the new SetccVT and MaskVT may still not be the same.
+  // Handle this as needed with vector sign extension or truncation.
+  SDValue Mask;
+  if (SetccVT == MaskVT)
+    Mask = Cond;
+  else if (SetccVT.getScalarSizeInBits() < MaskVT.getScalarSizeInBits())
+    Mask = DAG.getSignExtendVectorInReg(Cond, SDLoc(N), MaskVT);
+  else {
+    // SetccVT has wider elements: truncate.
+    EVT SubVT =
+      EVT::getVectorVT(Ctx, MaskVT.getVectorElementType(),
+                       SetccVT.getVectorNumElements());
+    SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, SDLoc(N), SubVT, Cond);
+
+    unsigned NumSubVecs =
+      VSelVT.getVectorNumElements() / SubVT.getVectorNumElements();
+    SmallVector<SDValue, 8> SubConcatOps(NumSubVecs);
+    SubConcatOps[0] = TruncCC;
+    for (unsigned i = 1; i < NumSubVecs; ++i)
+      SubConcatOps[i] = DAG.getUNDEF(SubVT);
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MaskVT, SubConcatOps);
+  }
+
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -2866,6 +2943,9 @@
   SDValue Cond1 = N->getOperand(0);
   EVT CondVT = Cond1.getValueType();
   if (CondVT.isVector()) {
+    if (SDValue Res = WidenVSELECT_SETCC(N))
+      return Res;
+
     EVT CondEltVT = CondVT.getVectorElementType();
     EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
                                        CondEltVT, WidenNumElts);
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -326,25 +326,21 @@
 ; CHECK:       @ BB#0:
 ; CHECK-NEXT:    .save {r4, r5, r11, lr}
 ; CHECK-NEXT:    push {r4, r5, r11, lr}
-; CHECK-NEXT:    add r12, sp, #48
-; CHECK-NEXT:    add lr, sp, #16
 ; CHECK-NEXT:    add r4, sp, #64
 ; CHECK-NEXT:    add r5, sp, #32
+; CHECK-NEXT:    add r12, sp, #48
+; CHECK-NEXT:    add lr, sp, #16
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r5]
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r4]
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [lr]
 ; CHECK-NEXT:    vld1.64 {d22, d23}, [r12]
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
 ; CHECK-NEXT:    vcgt.u32 q9, q11, q10
-; CHECK-NEXT:    vmovn.i32 d16, q8
-; CHECK-NEXT:    vmovn.i32 d17, q9
-; CHECK-NEXT:    vmov.i8 d18, #0x7
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vuzp.8 d17, d16
-; CHECK-NEXT:    vneg.s8 d16, d18
-; CHECK-NEXT:    vshl.i8 d17, d17, #7
+; CHECK-NEXT:    vmovn.i32 d17, q8
+; CHECK-NEXT:    vmovn.i32 d16, q9
 ; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vshl.s8 d16, d17, d16
+; CHECK-NEXT:    vmov d19, r0, r1
+; CHECK-NEXT:    vmovn.i16 d16, q8
 ; CHECK-NEXT:    vbsl d16, d19, d18
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
Index: test/CodeGen/SystemZ/vec-cmpsel.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/vec-cmpsel.ll
@@ -0,0 +1,1584 @@
+; Test that vector compare / select combinations do not produce any
+; unnecessary pack / unpack / shift instructions.
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define <2 x i8> @fun0(<2 x i8> %val1, <2 x i8> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun0: +; CHECK-NOT: vuphb +; CHECK-NOT: vuphh +; CHECK-NOT: vuphf +; CHECK-NOT: vrepih +; CHECK-NOT: vperm +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun1(<2 x i8> %val1, <2 x i8> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun1: +; CHECK-NOT: vuphh +; CHECK-NOT: vuphf +; CHECK-NOT: vperm +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i32> @fun2(<2 x i8> %val1, <2 x i8> %val2, + <2 x i32> %val3, <2 x i32> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4 + ret <2 x i32> %sel + +; CHECK-LABEL: fun2: +; CHECK-NOT: vuphf +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x i64> @fun3(<2 x i8> %val1, <2 x i8> %val2, + <2 x i64> %val3, <2 x i64> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %sel + +; CHECK-LABEL: fun3: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x float> @fun4(<2 x i8> %val1, <2 x i8> %val2, + <2 x float> %val3, <2 x float> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4 + ret <2 x float> %sel + +; CHECK-LABEL: fun4: +; CHECK-NOT: vuphf +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x double> @fun5(<2 x i8> %val1, <2 x i8> %val2, + <2 x double> %val3, <2 x double> %val4) { + %cmp = icmp eq <2 x i8> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 + ret <2 x double> %sel + +; CHECK-LABEL: fun5: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x i8> @fun6(<2 x i16> %val1, <2 x i16> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun6: +; CHECK-NOT: vuphh +; CHECK-NOT: vuphf +; CHECK-NOT: vrepih +; CHECK-NOT: vperm +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun7(<2 x i16> %val1, <2 x i16> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun7: +; CHECK-NOT: vuphh +; CHECK-NOT: vuphf +; CHECK-NOT: vperm +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i32> @fun8(<2 x i16> %val1, <2 x i16> %val2, + <2 x i32> %val3, <2 x i32> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4 + ret <2 x i32> %sel + +; CHECK-LABEL: fun8: +; CHECK-NOT: vuphf +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x i64> @fun9(<2 x i16> %val1, <2 x i16> %val2, + <2 x i64> %val3, <2 x i64> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %sel + +; CHECK-LABEL: fun9: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x float> @fun10(<2 x i16> %val1, <2 x i16> %val2, + <2 x float> %val3, <2 x float> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = 
select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4 + ret <2 x float> %sel + +; CHECK-LABEL: fun10: +; CHECK-NOT: vuphf +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x double> @fun11(<2 x i16> %val1, <2 x i16> %val2, + <2 x double> %val3, <2 x double> %val4) { + %cmp = icmp eq <2 x i16> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 + ret <2 x double> %sel + +; CHECK-LABEL: fun11: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x i8> @fun12(<2 x i32> %val1, <2 x i32> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = icmp eq <2 x i32> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun12: +; CHECK-NOT: vrepih +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun13(<2 x i32> %val1, <2 x i32> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = icmp eq <2 x i32> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun13: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i64> @fun14(<2 x i32> %val1, <2 x i32> %val2, + <2 x i64> %val3, <2 x i64> %val4) { + %cmp = icmp eq <2 x i32> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %sel + +; CHECK-LABEL: fun14: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x double> @fun15(<2 x i32> %val1, <2 x i32> %val2, + <2 x double> %val3, <2 x double> %val4) { + %cmp = icmp eq <2 x i32> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 + ret <2 x double> %sel + +; CHECK-LABEL: fun15: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x i8> @fun16(<2 x i64> %val1, <2 x i64> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = icmp eq <2 x i64> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun16: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun17(<2 x i64> %val1, <2 x i64> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = icmp eq <2 x i64> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun17: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i32> @fun18(<2 x i64> %val1, <2 x i64> %val2, + <2 x i32> %val3, <2 x i32> %val4) { + %cmp = icmp eq <2 x i64> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4 + ret <2 x i32> %sel + +; CHECK-LABEL: fun18: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x float> @fun19(<2 x i64> %val1, <2 x i64> %val2, + <2 x float> %val3, <2 x float> %val4) { + %cmp = icmp eq <2 x i64> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4 + ret <2 x float> %sel + +; CHECK-LABEL: fun19: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x i8> @fun20(<4 x i8> %val1, <4 x i8> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun20: +; CHECK-NOT: vuphb +; CHECK-NOT: vuphh +; CHECK-NOT: vperm +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun21(<4 x i8> %val1, <4 x i8> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun21: +; CHECK-NOT: vuphh +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah 
+} + +define <4 x i32> @fun22(<4 x i8> %val1, <4 x i8> %val2, + <4 x i32> %val3, <4 x i32> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %sel + +; CHECK-LABEL: fun22: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x i64> @fun23(<4 x i8> %val1, <4 x i8> %val2, + <4 x i64> %val3, <4 x i64> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4 + ret <4 x i64> %sel + +; CHECK-LABEL: fun23: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x float> @fun24(<4 x i8> %val1, <4 x i8> %val2, + <4 x float> %val3, <4 x float> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %sel + +; CHECK-LABEL: fun24: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x double> @fun25(<4 x i8> %val1, <4 x i8> %val2, + <4 x double> %val3, <4 x double> %val4) { + %cmp = icmp eq <4 x i8> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4 + ret <4 x double> %sel + +; CHECK-LABEL: fun25: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x i8> @fun26(<4 x i16> %val1, <4 x i16> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun26: +; CHECK-NOT: vuphh +; CHECK-NOT: vperm +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun27(<4 x i16> %val1, <4 x i16> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun27: +; CHECK-NOT: vuphh +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <4 x i32> @fun28(<4 x i16> %val1, <4 x i16> %val2, + <4 x i32> %val3, <4 x i32> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %sel + +; CHECK-LABEL: fun28: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x i64> @fun29(<4 x i16> %val1, <4 x i16> %val2, + <4 x i64> %val3, <4 x i64> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4 + ret <4 x i64> %sel + +; CHECK-LABEL: fun29: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x float> @fun30(<4 x i16> %val1, <4 x i16> %val2, + <4 x float> %val3, <4 x float> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %sel + +; CHECK-LABEL: fun30: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x double> @fun31(<4 x i16> %val1, <4 x i16> %val2, + <4 x double> %val3, <4 x double> %val4) { + %cmp = icmp eq <4 x i16> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4 + ret <4 x double> %sel + +; CHECK-LABEL: fun31: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x i8> @fun32(<4 x i32> %val1, <4 x i32> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = icmp eq <4 x i32> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun32: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun33(<4 x i32> %val1, <4 x 
i32> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = icmp eq <4 x i32> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun33: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <4 x i64> @fun34(<4 x i32> %val1, <4 x i32> %val2, + <4 x i64> %val3, <4 x i64> %val4) { + %cmp = icmp eq <4 x i32> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4 + ret <4 x i64> %sel + +; CHECK-LABEL: fun34: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x double> @fun35(<4 x i32> %val1, <4 x i32> %val2, + <4 x double> %val3, <4 x double> %val4) { + %cmp = icmp eq <4 x i32> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4 + ret <4 x double> %sel + +; CHECK-LABEL: fun35: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <4 x i8> @fun36(<4 x i64> %val1, <4 x i64> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = icmp eq <4 x i64> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun36: +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun37(<4 x i64> %val1, <4 x i64> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = icmp eq <4 x i64> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun37: +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <4 x i32> @fun38(<4 x i64> %val1, <4 x i64> %val2, + <4 x i32> %val3, <4 x i32> %val4) { + %cmp = icmp eq <4 x i64> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %sel + +; CHECK-LABEL: fun38: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x float> @fun39(<4 x i64> %val1, <4 x i64> %val2, + <4 x float> %val3, <4 x float> %val4) { + %cmp = icmp eq <4 x i64> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %sel + +; CHECK-LABEL: fun39: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <8 x i8> @fun40(<8 x i8> %val1, <8 x i8> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun40: +; CHECK-NOT: vuphb +; CHECK-NOT: vpkh +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <8 x i16> @fun41(<8 x i8> %val1, <8 x i8> %val2, + <8 x i16> %val3, <8 x i16> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %sel + +; CHECK-LABEL: fun41: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <8 x i32> @fun42(<8 x i8> %val1, <8 x i8> %val2, + <8 x i32> %val3, <8 x i32> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4 + ret <8 x i32> %sel + +; CHECK-LABEL: fun42: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <8 x i64> @fun43(<8 x i8> %val1, <8 x i8> %val2, + <8 x i64> %val3, <8 x i64> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4 + ret <8 x i64> %sel + +; CHECK-LABEL: fun43: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: 
veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <8 x float> @fun44(<8 x i8> %val1, <8 x i8> %val2, + <8 x float> %val3, <8 x float> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4 + ret <8 x float> %sel + +; CHECK-LABEL: fun44: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <8 x double> @fun45(<8 x i8> %val1, <8 x i8> %val2, + <8 x double> %val3, <8 x double> %val4) { + %cmp = icmp eq <8 x i8> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4 + ret <8 x double> %sel + +; CHECK-LABEL: fun45: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <8 x i8> @fun46(<8 x i16> %val1, <8 x i16> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = icmp eq <8 x i16> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun46: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <8 x i32> @fun47(<8 x i16> %val1, <8 x i16> %val2, + <8 x i32> %val3, <8 x i32> %val4) { + %cmp = icmp eq <8 x i16> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4 + ret <8 x i32> %sel + +; CHECK-LABEL: fun47: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <8 x i64> @fun48(<8 x i16> %val1, <8 x i16> %val2, + <8 x i64> %val3, <8 x i64> %val4) { + %cmp = icmp eq <8 x i16> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4 + ret <8 x i64> %sel + +; CHECK-LABEL: fun48: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <8 x float> @fun49(<8 x i16> %val1, <8 x i16> %val2, + <8 x float> %val3, <8 x float> %val4) { + %cmp = icmp eq <8 x i16> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4 + ret <8 x float> %sel + +; CHECK-LABEL: fun49: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <8 x double> @fun50(<8 x i16> %val1, <8 x i16> %val2, + <8 x double> %val3, <8 x double> %val4) { + %cmp = icmp eq <8 x i16> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4 + ret <8 x double> %sel + +; CHECK-LABEL: fun50: +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: vesrag +} + +define <8 x i8> @fun51(<8 x i32> %val1, <8 x i32> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = icmp eq <8 x i32> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun51: +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vpkh +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <8 x i16> @fun52(<8 x i32> %val1, <8 x i32> %val2, + <8 x i16> %val3, <8 x i16> %val4) { + %cmp = icmp eq <8 x i32> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %sel + +; CHECK-LABEL: fun52: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <8 x i64> @fun53(<8 x i32> %val1, <8 x i32> %val2, + <8 x i64> %val3, <8 x i64> %val4) { + %cmp = icmp eq <8 x i32> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4 + ret <8 x i64> %sel + +; CHECK-LABEL: fun53: +; 
CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <8 x double> @fun54(<8 x i32> %val1, <8 x i32> %val2, + <8 x double> %val3, <8 x double> %val4) { + %cmp = icmp eq <8 x i32> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4 + ret <8 x double> %sel + +; CHECK-LABEL: fun54: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <8 x i8> @fun55(<8 x i64> %val1, <8 x i64> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = icmp eq <8 x i64> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun55: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vpkh +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <8 x i16> @fun56(<8 x i64> %val1, <8 x i64> %val2, + <8 x i16> %val3, <8 x i16> %val4) { + %cmp = icmp eq <8 x i64> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %sel + +; CHECK-LABEL: fun56: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <8 x i32> @fun57(<8 x i64> %val1, <8 x i64> %val2, + <8 x i32> %val3, <8 x i32> %val4) { + %cmp = icmp eq <8 x i64> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4 + ret <8 x i32> %sel + +; CHECK-LABEL: fun57: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <8 x float> @fun58(<8 x i64> %val1, <8 x i64> %val2, + <8 x float> %val3, <8 x float> %val4) { + %cmp = icmp eq <8 x i64> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4 + ret <8 x float> %sel + +; CHECK-LABEL: fun58: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x i16> @fun59(<16 x i8> %val1, <16 x i8> %val2, + <16 x i16> %val3, <16 x i16> %val4) { + %cmp = icmp eq <16 x i8> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4 + ret <16 x i16> %sel + +; CHECK-LABEL: fun59: +; CHECK-NOT: veslh +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vesrah +} + +define <16 x i32> @fun60(<16 x i8> %val1, <16 x i8> %val2, + <16 x i32> %val3, <16 x i32> %val4) { + %cmp = icmp eq <16 x i8> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4 + ret <16 x i32> %sel + +; CHECK-LABEL: fun60: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <16 x i64> @fun61(<16 x i8> %val1, <16 x i8> %val2, + <16 x i64> %val3, <16 x i64> %val4) { + %cmp = icmp eq <16 x i8> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4 + ret <16 x i64> %sel + +; CHECK-LABEL: fun61: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x float> @fun62(<16 x i8> %val1, <16 x i8> %val2, + <16 x float> %val3, <16 x float> 
%val4) { + %cmp = icmp eq <16 x i8> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4 + ret <16 x float> %sel + +; CHECK-LABEL: fun62: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +} + +define <16 x double> @fun63(<16 x i8> %val1, <16 x i8> %val2, + <16 x double> %val3, <16 x double> %val4) { + %cmp = icmp eq <16 x i8> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4 + ret <16 x double> %sel + +; CHECK-LABEL: fun63: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x i8> @fun64(<16 x i16> %val1, <16 x i16> %val2, + <16 x i8> %val3, <16 x i8> %val4) { + %cmp = icmp eq <16 x i16> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %sel + +; CHECK-LABEL: fun64: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <16 x i32> @fun65(<16 x i16> %val1, <16 x i16> %val2, + <16 x i32> %val3, <16 x i32> %val4) { + %cmp = icmp eq <16 x i16> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4 + ret <16 x i32> %sel + +; CHECK-LABEL: fun65: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x i64> @fun66(<16 x i16> %val1, <16 x i16> %val2, + <16 x i64> %val3, <16 x i64> %val4) { + %cmp = icmp eq <16 x i16> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4 + ret <16 x i64> %sel + +; CHECK-LABEL: fun66: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x float> @fun67(<16 x i16> %val1, <16 x i16> %val2, + <16 x float> %val3, <16 x float> %val4) { + %cmp = icmp eq <16 x i16> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4 + ret <16 x float> %sel + +; CHECK-LABEL: fun67: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x double> @fun68(<16 x i16> %val1, <16 x i16> %val2, + <16 x double> %val3, <16 x double> %val4) { + %cmp = icmp eq <16 x i16> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4 + ret <16 x double> %sel + +; CHECK-LABEL: fun68: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x i8> @fun69(<16 x i32> %val1, <16 x i32> %val2, + <16 x i8> %val3, <16 x i8> %val4) { + %cmp = icmp eq <16 x i32> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %sel + +; CHECK-LABEL: fun69: 
+; CHECK-NOT: veslh +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vesrah +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <16 x i16> @fun70(<16 x i32> %val1, <16 x i32> %val2, + <16 x i16> %val3, <16 x i16> %val4) { + %cmp = icmp eq <16 x i32> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4 + ret <16 x i16> %sel + +; CHECK-LABEL: fun70: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <16 x i64> @fun71(<16 x i32> %val1, <16 x i32> %val2, + <16 x i64> %val3, <16 x i64> %val4) { + %cmp = icmp eq <16 x i32> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4 + ret <16 x i64> %sel + +; CHECK-LABEL: fun71: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x double> @fun72(<16 x i32> %val1, <16 x i32> %val2, + <16 x double> %val3, <16 x double> %val4) { + %cmp = icmp eq <16 x i32> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4 + ret <16 x double> %sel + +; CHECK-LABEL: fun72: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x i8> @fun73(<16 x i64> %val1, <16 x i64> %val2, + <16 x i8> %val3, <16 x i8> %val4) { + %cmp = icmp eq <16 x i64> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %sel + +; CHECK-LABEL: fun73: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vesrah +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <16 x i16> @fun74(<16 x i64> %val1, <16 x i64> %val2, + <16 x i16> %val3, <16 x i16> %val4) { + %cmp = icmp eq <16 x i64> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4 + ret <16 x i16> %sel + +; CHECK-LABEL: fun74: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <16 x i32> @fun75(<16 x i64> %val1, <16 x i64> %val2, + <16 x i32> %val3, <16 x i32> %val4) { + %cmp = icmp eq <16 x i64> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4 + ret <16 x i32> %sel + +; CHECK-LABEL: fun75: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x float> @fun76(<16 x i64> %val1, <16 x i64> %val2, + <16 x float> %val3, <16 x float> %val4) { + %cmp = icmp eq <16 x i64> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4 + ret <16 x float> %sel + +; CHECK-LABEL: fun76: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: 
veslf +; CHECK-NOT: vesraf +} + +define <2 x i8> @fun77(<2 x float> %val1, <2 x float> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = fcmp ogt <2 x float> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun77: +; CHECK-NOT: vrepih +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun78(<2 x float> %val1, <2 x float> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = fcmp ogt <2 x float> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun78: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i64> @fun79(<2 x float> %val1, <2 x float> %val2, + <2 x i64> %val3, <2 x i64> %val4) { + %cmp = fcmp ogt <2 x float> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 + ret <2 x i64> %sel + +; CHECK-LABEL: fun79: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x double> @fun80(<2 x float> %val1, <2 x float> %val2, + <2 x double> %val3, <2 x double> %val4) { + %cmp = fcmp ogt <2 x float> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 + ret <2 x double> %sel + +; CHECK-LABEL: fun80: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <2 x i8> @fun81(<2 x double> %val1, <2 x double> %val2, + <2 x i8> %val3, <2 x i8> %val4) { + %cmp = fcmp ogt <2 x double> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4 + ret <2 x i8> %sel + +; CHECK-LABEL: fun81: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <2 x i16> @fun82(<2 x double> %val1, <2 x double> %val2, + <2 x i16> %val3, <2 x i16> %val4) { + %cmp = fcmp ogt <2 x double> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4 + ret <2 x i16> %sel + +; CHECK-LABEL: fun82: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <2 x i32> @fun83(<2 x double> %val1, <2 x double> %val2, + <2 x i32> %val3, <2 x i32> %val4) { + %cmp = fcmp ogt <2 x double> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4 + ret <2 x i32> %sel + +; CHECK-LABEL: fun83: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <2 x float> @fun84(<2 x double> %val1, <2 x double> %val2, + <2 x float> %val3, <2 x float> %val4) { + %cmp = fcmp ogt <2 x double> %val1, %val2 + %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4 + ret <2 x float> %sel + +; CHECK-LABEL: fun84: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x i8> @fun85(<4 x float> %val1, <4 x float> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = fcmp ogt <4 x float> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun85: +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun86(<4 x float> %val1, <4 x float> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = fcmp ogt <4 x float> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun86: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <4 x i64> @fun87(<4 x float> %val1, <4 x float> %val2, + <4 x i64> %val3, <4 x i64> %val4) { + %cmp = fcmp ogt <4 x float> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4 + ret <4 x i64> %sel + +; CHECK-LABEL: fun87: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <4 x double> @fun88(<4 x float> %val1, <4 x float> %val2, + <4 x double> %val3, <4 x double> %val4) { + %cmp = fcmp ogt <4 x 
float> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4 + ret <4 x double> %sel + +; CHECK-LABEL: fun88: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <4 x i8> @fun89(<4 x double> %val1, <4 x double> %val2, + <4 x i8> %val3, <4 x i8> %val4) { + %cmp = fcmp ogt <4 x double> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4 + ret <4 x i8> %sel + +; CHECK-LABEL: fun89: +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <4 x i16> @fun90(<4 x double> %val1, <4 x double> %val2, + <4 x i16> %val3, <4 x i16> %val4) { + %cmp = fcmp ogt <4 x double> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4 + ret <4 x i16> %sel + +; CHECK-LABEL: fun90: +; CHECK-NOT: vpkg +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <4 x i32> @fun91(<4 x double> %val1, <4 x double> %val2, + <4 x i32> %val3, <4 x i32> %val4) { + %cmp = fcmp ogt <4 x double> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 + ret <4 x i32> %sel + +; CHECK-LABEL: fun91: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <4 x float> @fun92(<4 x double> %val1, <4 x double> %val2, + <4 x float> %val3, <4 x float> %val4) { + %cmp = fcmp ogt <4 x double> %val1, %val2 + %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 + ret <4 x float> %sel + +; CHECK-LABEL: fun92: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <8 x i8> @fun93(<8 x float> %val1, <8 x float> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = fcmp ogt <8 x float> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun93: +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vpkh +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <8 x i16> @fun94(<8 x float> %val1, <8 x float> %val2, + <8 x i16> %val3, <8 x i16> %val4) { + %cmp = fcmp ogt <8 x float> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %sel + +; CHECK-LABEL: fun94: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <8 x i64> @fun95(<8 x float> %val1, <8 x float> %val2, + <8 x i64> %val3, <8 x i64> %val4) { + %cmp = fcmp ogt <8 x float> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4 + ret <8 x i64> %sel + +; CHECK-LABEL: fun95: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <8 x double> @fun96(<8 x float> %val1, <8 x float> %val2, + <8 x double> %val3, <8 x double> %val4) { + %cmp = fcmp ogt <8 x float> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4 + ret <8 x double> %sel + +; CHECK-LABEL: fun96: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <8 x i8> @fun97(<8 x double> %val1, <8 x double> %val2, + <8 x i8> %val3, <8 x i8> %val4) { + %cmp = fcmp ogt <8 x double> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4 + ret <8 x i8> %sel + +; CHECK-LABEL: fun97: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: vpkf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: vpkh +; CHECK-NOT: veslb +; 
CHECK-NOT: vesrab +} + +define <8 x i16> @fun98(<8 x double> %val1, <8 x double> %val2, + <8 x i16> %val3, <8 x i16> %val4) { + %cmp = fcmp ogt <8 x double> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 + ret <8 x i16> %sel + +; CHECK-LABEL: fun98: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <8 x i32> @fun99(<8 x double> %val1, <8 x double> %val2, + <8 x i32> %val3, <8 x i32> %val4) { + %cmp = fcmp ogt <8 x double> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4 + ret <8 x i32> %sel + +; CHECK-LABEL: fun99: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <8 x float> @fun100(<8 x double> %val1, <8 x double> %val2, + <8 x float> %val3, <8 x float> %val4) { + %cmp = fcmp ogt <8 x double> %val1, %val2 + %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4 + ret <8 x float> %sel + +; CHECK-LABEL: fun100: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x i8> @fun101(<16 x float> %val1, <16 x float> %val2, + <16 x i8> %val3, <16 x i8> %val4) { + %cmp = fcmp ogt <16 x float> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %sel + +; CHECK-LABEL: fun101: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <16 x i16> @fun102(<16 x float> %val1, <16 x float> %val2, + <16 x i16> %val3, <16 x i16> %val4) { + %cmp = fcmp ogt <16 x float> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4 + ret <16 x i16> %sel + +; CHECK-LABEL: fun102: +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <16 x i64> @fun103(<16 x float> %val1, <16 x float> %val2, + <16 x i64> %val3, <16 x i64> %val4) { + %cmp = fcmp ogt <16 x float> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4 + ret <16 x i64> %sel + +; CHECK-LABEL: fun103: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x double> @fun104(<16 x float> %val1, <16 x float> %val2, + <16 x double> %val3, <16 x double> %val4) { + %cmp = fcmp ogt <16 x float> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4 + ret <16 x double> %sel + +; CHECK-LABEL: fun104: +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +; CHECK-NOT: veslg +; CHECK-NOT: vesrag +} + +define <16 x i8> @fun105(<16 x double> %val1, <16 x double> %val2, + <16 x i8> %val3, <16 x i8> %val4) { + %cmp = fcmp ogt <16 x double> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 + ret <16 x i8> %sel + +; CHECK-LABEL: fun105: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: veslh +; CHECK-NOT: vesrah 
+; CHECK-NOT: vesrah +; CHECK-NOT: veslb +; CHECK-NOT: vesrab +} + +define <16 x i16> @fun106(<16 x double> %val1, <16 x double> %val2, + <16 x i16> %val3, <16 x i16> %val4) { + %cmp = fcmp ogt <16 x double> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4 + ret <16 x i16> %sel + +; CHECK-LABEL: fun106: +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +; CHECK-NOT: veslf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: vesraf +; CHECK-NOT: veslh +; CHECK-NOT: vesrah +} + +define <16 x i32> @fun107(<16 x double> %val1, <16 x double> %val2, + <16 x i32> %val3, <16 x i32> %val4) { + %cmp = fcmp ogt <16 x double> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4 + ret <16 x i32> %sel + +; CHECK-LABEL: fun107: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + +define <16 x float> @fun108(<16 x double> %val1, <16 x double> %val2, + <16 x float> %val3, <16 x float> %val4) { + %cmp = fcmp ogt <16 x double> %val1, %val2 + %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4 + ret <16 x float> %sel + +; CHECK-LABEL: fun108: +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +; CHECK-NOT: veslf +; CHECK-NOT: vesraf +} + Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll =================================================================== --- test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -29,7 +29,6 @@ ; X32-NEXT: cmpordps %xmm0, %xmm0 ; X32-NEXT: pmovsxdq %xmm0, %xmm0 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: pslld $31, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; X32-NEXT: extractps $1, %xmm1, (%eax) ; X32-NEXT: movss %xmm1, (%eax) @@ -41,7 +40,6 @@ ; X64-NEXT: cmpordps %xmm0, %xmm0 ; X64-NEXT: pmovsxdq %xmm0, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: pslld $31, %xmm0 ; X64-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; X64-NEXT: movlps %xmm1, (%rax) ; X64-NEXT: retq @@ -83,10 +81,9 @@ ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: cmpltps %xmm2, %xmm0 ; X32-NEXT: pmovsxdq %xmm0, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: pslld $31, %xmm0 ; X32-NEXT: movaps {{.*#+}} xmm3 = <1,1,u,u> ; X32-NEXT: addps %xmm1, %xmm3 +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X32-NEXT: movaps %xmm1, %xmm4 ; X32-NEXT: blendvps %xmm0, %xmm3, %xmm4 ; X32-NEXT: cmpeqps %xmm2, %xmm1 Index: test/CodeGen/X86/2011-10-21-widen-cmp.ll =================================================================== --- test/CodeGen/X86/2011-10-21-widen-cmp.ll +++ test/CodeGen/X86/2011-10-21-widen-cmp.ll @@ -11,7 +11,6 @@ ; CHECK-NEXT: cmpordps %xmm0, %xmm0 ; CHECK-NEXT: pmovsxdq %xmm0, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pslld $31, %xmm0 ; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; CHECK-NEXT: movlps %xmm1, (%rax) ; CHECK-NEXT: retq Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -519,72 +519,68 @@ ; SSE2-LABEL: test13: ; SSE2: ## BB#0: ## %vector.ph ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm2 -; SSE2-NEXT: movdqu 16(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: movdqu 16(%rsi), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE2-NEXT: psllw $15, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: movdqu %xmm4, (%rdi) +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqu %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test13: ; SSSE3: ## BB#0: ## %vector.ph ; SSSE3-NEXT: movdqu (%rdi), %xmm0 -; SSSE3-NEXT: movdqu (%rsi), %xmm2 -; SSSE3-NEXT: movdqu 16(%rsi), %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqu (%rsi), %xmm1 +; SSSE3-NEXT: movdqu 16(%rsi), %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; 
SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: psubd %xmm3, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufb %xmm5, %xmm6 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSSE3-NEXT: psllw $15, %xmm6 -; SSSE3-NEXT: psraw $15, %xmm6 -; SSSE3-NEXT: psubd %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm5, %xmm0 -; SSSE3-NEXT: pshufb %xmm5, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: pandn %xmm1, %xmm6 -; SSSE3-NEXT: movdqu %xmm6, (%rdi) +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pshufb %xmm2, %xmm5 +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: movdqu %xmm1, (%rdi) ; SSSE3-NEXT: retq ; ; AVX1-LABEL: test13: @@ -648,145 +644,118 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind { ; SSE2-LABEL: test14: ; SSE2: ## BB#0: ## %vector.ph -; SSE2-NEXT: movdqu (%rdi), %xmm1 -; SSE2-NEXT: movdqu (%rsi), %xmm8 -; SSE2-NEXT: movdqu 16(%rsi), %xmm9 -; SSE2-NEXT: movdqu 32(%rsi), %xmm10 -; SSE2-NEXT: movdqu 48(%rsi), %xmm6 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu (%rsi), %xmm7 +; SSE2-NEXT: movdqu 16(%rsi), %xmm6 +; SSE2-NEXT: movdqu 32(%rsi), %xmm9 +; SSE2-NEXT: movdqu 48(%rsi), %xmm8 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: psubd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psubd %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: psubd %xmm10, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm10 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE2-NEXT: psllw $15, %xmm7 -; SSE2-NEXT: psraw $15, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: movdqa %xmm8, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE2-NEXT: psllw $15, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 -; SSE2-NEXT: psllw $7, %xmm4 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm11 +; SSE2-NEXT: psubd %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm9 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE2-NEXT: psubd %xmm8, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: packuswb %xmm1, %xmm5 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm11 -; SSE2-NEXT: movdqu %xmm11, (%rdi) +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: 
pand %xmm2, %xmm9 +; SSE2-NEXT: packuswb %xmm5, %xmm9 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: packuswb %xmm6, %xmm7 +; SSE2-NEXT: packuswb %xmm9, %xmm7 +; SSE2-NEXT: pandn %xmm0, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test14: ; SSSE3: ## BB#0: ## %vector.ph -; SSSE3-NEXT: movdqu (%rdi), %xmm1 -; SSSE3-NEXT: movdqu (%rsi), %xmm8 -; SSSE3-NEXT: movdqu 16(%rsi), %xmm9 -; SSSE3-NEXT: movdqu 32(%rsi), %xmm10 -; SSSE3-NEXT: movdqu 48(%rsi), %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqu (%rdi), %xmm0 +; SSSE3-NEXT: movdqu (%rsi), %xmm7 +; SSSE3-NEXT: movdqu 16(%rsi), %xmm6 +; SSSE3-NEXT: movdqu 32(%rsi), %xmm9 +; SSSE3-NEXT: movdqu 48(%rsi), %xmm8 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: psubd %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: psubd %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: psubd %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm11, %xmm4 -; SSSE3-NEXT: movdqa %xmm7, %xmm5 -; SSSE3-NEXT: psubd %xmm10, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm10 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pshufb %xmm11, %xmm10 -; SSSE3-NEXT: 
punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm4[0]
-; SSSE3-NEXT: psllw $15, %xmm10
-; SSSE3-NEXT: psraw $15, %xmm10
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm4, %xmm10
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: psubd %xmm9, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm9
-; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: psubd %xmm9, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm9
+; SSSE3-NEXT: pxor %xmm2, %xmm5
 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm9
-; SSSE3-NEXT: pshufb %xmm11, %xmm9
 ; SSSE3-NEXT: movdqa %xmm8, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pshufb %xmm11, %xmm5
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0]
-; SSSE3-NEXT: psllw $15, %xmm5
-; SSSE3-NEXT: psraw $15, %xmm5
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm10[0]
-; SSSE3-NEXT: psllw $7, %xmm5
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm5
-; SSSE3-NEXT: pcmpgtb %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
 ; SSSE3-NEXT: psubd %xmm8, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: packuswb %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: packuswb %xmm2, %xmm3
-; SSSE3-NEXT: packuswb %xmm7, %xmm3
-; SSSE3-NEXT: pandn %xmm3, %xmm0
-; SSSE3-NEXT: movdqu %xmm0, (%rdi)
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: packuswb %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm4, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm7
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1]
+; SSSE3-NEXT: andnpd %xmm0, %xmm9
+; SSSE3-NEXT: movupd %xmm9, (%rdi)
 ; SSSE3-NEXT: retq
 ;
 ; AVX1-LABEL: test14:
@@ -805,23 +774,17 @@
 ; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm7
 ; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
 ; AVX1-NEXT: vpcmpgtd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm12
-; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm7
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm4
 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpacksswb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, 
%xmm4 +; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm4 ; AVX1-NEXT: vpsubd %xmm1, %xmm10, %xmm1 ; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] @@ -832,8 +795,9 @@ ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -850,29 +814,25 @@ ; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm7 +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpcmpgtd %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vpacksswb %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-NEXT: vpsllw $7, %xmm4, %xmm4 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -896,72 +856,68 @@ ; SSE2-LABEL: test15: ; SSE2: ## BB#0: ## %vector.ph ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm2 -; SSE2-NEXT: movdqu 16(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: movdqu 16(%rsi), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = 
[2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE2-NEXT: psllw $15, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rdi) +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test15: ; SSSE3: ## BB#0: ## %vector.ph ; SSSE3-NEXT: movdqu (%rdi), %xmm0 -; SSSE3-NEXT: movdqu (%rsi), %xmm2 -; SSSE3-NEXT: movdqu 16(%rsi), %xmm4 +; SSSE3-NEXT: movdqu (%rsi), %xmm1 +; SSSE3-NEXT: movdqu 16(%rsi), %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: psubd %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSSE3-NEXT: psllw $15, %xmm3 -; SSSE3-NEXT: psraw $15, %xmm3 -; SSSE3-NEXT: psubd %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: movdqu %xmm1, (%rdi) +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: movdqu %xmm5, (%rdi) ; SSSE3-NEXT: retq ; ; AVX1-LABEL: test15: @@ -1026,72 +982,68 @@ ; SSE2-LABEL: test16: ; SSE2: ## BB#0: ## %vector.ph ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm2 -; SSE2-NEXT: movdqu 16(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: movdqu 16(%rsi), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE2-NEXT: psllw $15, %xmm4 -; SSE2-NEXT: psraw $15, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rdi) +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rdi) 
; SSE2-NEXT: retq ; ; SSSE3-LABEL: test16: ; SSSE3: ## BB#0: ## %vector.ph ; SSSE3-NEXT: movdqu (%rdi), %xmm0 -; SSSE3-NEXT: movdqu (%rsi), %xmm2 -; SSSE3-NEXT: movdqu 16(%rsi), %xmm4 +; SSSE3-NEXT: movdqu (%rsi), %xmm1 +; SSSE3-NEXT: movdqu 16(%rsi), %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: psubd %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSSE3-NEXT: psllw $15, %xmm3 -; SSSE3-NEXT: psraw $15, %xmm3 -; SSSE3-NEXT: psubd %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: movdqu %xmm1, (%rdi) +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: movdqu %xmm5, (%rdi) ; SSSE3-NEXT: retq ; ; AVX1-LABEL: test16: