Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -881,7 +881,7 @@
 
   // Calculate the overflow flag: zero extend the arithmetic result from
   // the original type.
-  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT.getScalarType());
   // Overflowed if and only if this is not equal to Res.
   Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
 
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -674,6 +674,7 @@
   SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
   SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
   SDValue ScalarizeVecRes_StrictFPOp(SDNode *N);
+  SDValue ScalarizeVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue ScalarizeVecRes_InregOp(SDNode *N);
   SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
 
@@ -728,6 +729,8 @@
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                              SDValue &Lo, SDValue &Hi);
   void SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
 
@@ -809,6 +812,7 @@
   SDValue WidenVecRes_Binary(SDNode *N);
   SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
   SDValue WidenVecRes_StrictFP(SDNode *N);
+  SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
   SDValue WidenVecRes_POWI(SDNode *N);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -171,6 +171,14 @@
   case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    R = ScalarizeVecRes_OverflowOp(N, ResNo);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     R = ScalarizeVecRes_MULFIX(N);
@@ -235,6 +243,43 @@
   return Result;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
+                                                     unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+
+  SDValue ScalarLHS, ScalarRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
+    ScalarLHS = GetScalarizedVector(N->getOperand(0));
+    ScalarRHS = GetScalarizedVector(N->getOperand(1));
+  } else {
+    SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
+    DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
+    DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
+    ScalarLHS = ElemsLHS[0];
+    ScalarRHS = ElemsRHS[0];
+  }
+
+  SDVTList ScalarVTs = DAG.getVTList(
+      ResVT.getVectorElementType(), OvVT.getVectorElementType());
+  SDNode *ScalarNode = DAG.getNode(
+      N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+
+  // Replace the other vector result not being explicitly scalarized here.
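+  // (The scalar node built above produces both the element value and the
+  // element overflow flag, so the result that is not being legalized through
+  // this call must also be remapped away from the original two-result node;
+  // if it is not itself being scalarized, it is wrapped back into a
+  // single-element vector and replaced directly.)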
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+    SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+
+  return SDValue(ScalarNode, ResNo);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
                                                        unsigned ResNo) {
   SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -859,6 +904,14 @@
   case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
+    break;
   case ISD::SMULFIX:
   case ISD::UMULFIX:
     SplitVecRes_MULFIX(N, Lo, Hi);
@@ -1205,6 +1258,47 @@
   ReplaceValueWith(SDValue(N, 1), Chain);
 }
 
+void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+                                              SDValue &Lo, SDValue &Hi) {
+  SDLoc dl(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
+  std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
+  std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
+
+  SDValue LoLHS, HiLHS, LoRHS, HiRHS;
+  if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+    GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
+    GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
+  } else {
+    std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
+    std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
+  }
+
+  unsigned Opcode = N->getOpcode();
+  SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
+  SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
+  SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
+  SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+
+  Lo = SDValue(LoNode, ResNo);
+  Hi = SDValue(HiNode, ResNo);
+
+  // Replace the other vector result not being explicitly split here.
+  unsigned OtherNo = 1 - ResNo;
+  EVT OtherVT = N->getValueType(OtherNo);
+  if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+    SetSplitVector(SDValue(N, OtherNo),
+                   SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+  } else {
+    SDValue OtherVal = DAG.getNode(
+        ISD::CONCAT_VECTORS, dl, OtherVT,
+        SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+    ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+  }
+}
+
 void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                                                      SDValue &Hi) {
   SDValue Vec = N->getOperand(0);
@@ -2471,6 +2565,15 @@
     Res = WidenVecRes_StrictFP(N);
     break;
 
+  case ISD::UADDO:
+  case ISD::SADDO:
+  case ISD::USUBO:
+  case ISD::SSUBO:
+  case ISD::UMULO:
+  case ISD::SMULO:
+    Res = WidenVecRes_OverflowOp(N, ResNo);
+    break;
+
   case ISD::FCOPYSIGN:
     Res = WidenVecRes_FCOPYSIGN(N);
     break;
@@ -2845,6 +2948,58 @@
   return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
+  SDLoc DL(N);
+  EVT ResVT = N->getValueType(0);
+  EVT OvVT = N->getValueType(1);
+  EVT WideResVT, WideOvVT;
+  SDValue WideLHS, WideRHS;
+
+  // TODO: This might result in a widen/split loop.
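+  // If the value result is the one being widened (ResNo == 0), widen it to
+  // its transformed type and build an overflow vector with the same element
+  // count; otherwise widen the overflow result and pad the inputs out to the
+  // matching result type with INSERT_SUBVECTOR into undef.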
+ if (ResNo == 0) { + WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT); + WideOvVT = EVT::getVectorVT( + *DAG.getContext(), OvVT.getVectorElementType(), + WideResVT.getVectorNumElements()); + + WideLHS = GetWidenedVector(N->getOperand(0)); + WideRHS = GetWidenedVector(N->getOperand(1)); + } else { + WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT); + WideResVT = EVT::getVectorVT( + *DAG.getContext(), ResVT.getVectorElementType(), + WideOvVT.getVectorNumElements()); + + SDValue Zero = DAG.getConstant( + 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())); + WideLHS = DAG.getNode( + ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT), + N->getOperand(0), Zero); + WideRHS = DAG.getNode( + ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT), + N->getOperand(1), Zero); + } + + SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT); + SDNode *WideNode = DAG.getNode( + N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode(); + + // Replace the other vector result not being explicitly widened here. + unsigned OtherNo = 1 - ResNo; + EVT OtherVT = N->getValueType(OtherNo); + if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) { + SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo)); + } else { + SDValue Zero = DAG.getConstant( + 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())); + SDValue OtherVal = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero); + ReplaceValueWith(SDValue(N, OtherNo), OtherVal); + } + + return SDValue(WideNode, ResNo); +} + SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue InOp = N->getOperand(0); SDLoc DL(N); Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6113,7 +6113,13 @@ SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); - SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); + EVT ResultVT = Op1.getValueType(); + EVT OverflowVT = MVT::i1; + if (ResultVT.isVector()) + OverflowVT = EVT::getVectorVT( + *Context, OverflowVT, ResultVT.getVectorNumElements()); + + SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT); setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2)); return nullptr; } Index: llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll +++ llvm/trunk/test/CodeGen/AArch64/vec_uaddo.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} 
@llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.2s, v0.2s, v1.2s +; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: str s1, [x0] +; CHECK-NEXT: ret + %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) + %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 + %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 + %res = sext <1 x i1> %obit to <1 x i32> + store <1 x i32> %val, <1 x i32>* %p2 + ret <1 x i32> %res +} + +define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.2s, v0.2s, v1.2s +; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: ret + %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) + %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i32> %val, <2 x i32>* %p2 + ret <2 x i32> %res +} + +define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: add x8, x0, #8 // =8 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: st1 { v1.s }[2], [x8] +; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: ret + %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) + %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 + %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 + %res = sext <3 x i1> %obit to <3 x i32> + store <3 x i32> %val, <3 x i32>* %p2 + ret <3 x i32> %res +} + +define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: ret + %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) + %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i32> %val, <4 x i32>* %p2 + ret <4 x i32> %res +} + +define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v6i32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w6 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov v0.s[1], w7 +; CHECK-NEXT: ldr s2, [sp, #16] +; CHECK-NEXT: ld1 { v0.s }[2], [x8] +; CHECK-NEXT: add x9, sp, #8 // =8 +; CHECK-NEXT: add x10, sp, #24 // =24 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: mov v1.s[1], w1 +; CHECK-NEXT: fmov s3, w4 +; CHECK-NEXT: ldr x11, [sp, #32] +; CHECK-NEXT: mov v1.s[2], w2 +; CHECK-NEXT: mov v3.s[1], w5 +; CHECK-NEXT: mov v1.s[3], w3 +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmhi v3.4s, v3.4s, 
v2.4s +; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s +; CHECK-NEXT: str d2, [x11, #16] +; CHECK-NEXT: xtn v2.4h, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov w5, v2.s[1] +; CHECK-NEXT: mov w1, v1.s[1] +; CHECK-NEXT: mov w2, v1.s[2] +; CHECK-NEXT: mov w3, v1.s[3] +; CHECK-NEXT: fmov w4, s2 +; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: str q0, [x11] +; CHECK-NEXT: ret + %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) + %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 + %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 + %res = sext <6 x i1> %obit to <6 x i32> + store <6 x i32> %val, <6 x i32>* %p2 + ret <6 x i32> %res +} + +define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; CHECK-LABEL: uaddo_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: cmhi v1.4s, v1.4s, v3.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: ret + %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) + %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i32> %val, <8 x i32>* %p2 + ret <8 x i32> %res +} + +define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; CHECK-LABEL: uaddo_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add v4.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b +; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-NEXT: sshr v0.4s, v1.4s, #31 +; CHECK-NEXT: sshr v1.4s, v2.4s, #31 +; CHECK-NEXT: shl v2.4s, v3.4s, #31 +; CHECK-NEXT: shl v3.4s, v5.4s, #31 +; CHECK-NEXT: sshr v2.4s, v2.4s, #31 +; CHECK-NEXT: sshr v3.4s, v3.4s, #31 +; CHECK-NEXT: str q4, [x0] +; CHECK-NEXT: ret + %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) + %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i8> %val, <16 x i8>* %p2 + ret <16 x i32> %res +} + +define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; CHECK-LABEL: uaddo_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add v2.8h, v0.8h, v1.8h +; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: shl v3.4s, v0.4s, #31 +; CHECK-NEXT: sshr v0.4s, v1.4s, #31 +; CHECK-NEXT: sshr v1.4s, v3.4s, #31 +; CHECK-NEXT: str q2, [x0] +; CHECK-NEXT: ret + %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x 
i16>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i16> %val, <8 x i16>* %p2 + ret <8 x i32> %res +} + +define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; CHECK-LABEL: uaddo_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: ret + %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) + %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i64> %val, <2 x i64>* %p2 + ret <2 x i32> %res +} + +define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; CHECK-LABEL: uaddo_v4i24: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v1.4s, #255, lsl #24 +; CHECK-NEXT: bic v0.4s, #255, lsl #24 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: bic v1.4s, #255, lsl #24 +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov w10, v0.s[1] +; CHECK-NEXT: sturh w8, [x0, #9] +; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: strh w9, [x0, #6] +; CHECK-NEXT: sturh w10, [x0, #3] +; CHECK-NEXT: lsr w9, w9, #16 +; CHECK-NEXT: lsr w10, w10, #16 +; CHECK-NEXT: strb w8, [x0, #11] +; CHECK-NEXT: mvn v0.16b, v1.16b +; CHECK-NEXT: lsr w8, w11, #16 +; CHECK-NEXT: strh w11, [x0] +; CHECK-NEXT: strb w9, [x0, #8] +; CHECK-NEXT: strb w10, [x0, #5] +; CHECK-NEXT: strb w8, [x0, #2] +; CHECK-NEXT: ret + %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) + %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i24> %val, <4 x i24>* %p2 + ret <4 x i32> %res +} + +define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; CHECK-LABEL: uaddo_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.4h, #1 +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: add v1.4h, v0.4h, v1.4h +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w8, w9, #1, #1 +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: and v0.8b, v1.8b, v2.8b +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h +; CHECK-NEXT: bfi w8, w9, #2, #1 +; CHECK-NEXT: umov w9, v1.h[3] +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: bfi w8, w9, #3, #29 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret + %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) + %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i1> %val, <4 x i1>* %p2 + ret <4 x i32> %res +} + +define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; CHECK-LABEL: uaddo_v2i128: +; CHECK: // %bb.0: +; CHECK-NEXT: adds x9, x2, x6 +; CHECK-NEXT: adcs x10, x3, x7 +; CHECK-NEXT: cmp x9, x2 +; CHECK-NEXT: cset w11, lo +; CHECK-NEXT: cmp x10, x3 +; CHECK-NEXT: cset w12, lo +; CHECK-NEXT: csel w11, w11, w12, eq +; CHECK-NEXT: adds x12, x0, x4 +; CHECK-NEXT: adcs x13, x1, x5 +; CHECK-NEXT: cmp x12, x0 +; CHECK-NEXT: cset w14, lo +; CHECK-NEXT: 
cmp x13, x1 +; CHECK-NEXT: cset w15, lo +; CHECK-NEXT: csel w14, w14, w15, eq +; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: mov v0.s[1], w11 +; CHECK-NEXT: shl v0.2s, v0.2s, #31 +; CHECK-NEXT: sshr v0.2s, v0.2s, #31 +; CHECK-NEXT: stp x9, x10, [x8, #16] +; CHECK-NEXT: stp x12, x13, [x8] +; CHECK-NEXT: ret + %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) + %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i128> %val, <2 x i128>* %p2 + ret <2 x i32> %res +} Index: llvm/trunk/test/CodeGen/AMDGPU/saddo.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/saddo.ll +++ llvm/trunk/test/CodeGen/AMDGPU/saddo.ll @@ -1,11 +1,14 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s + declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone + +declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + ; FUNC-LABEL: {{^}}saddo_i64_zext: define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -65,3 +68,22 @@ store i1 %carry, i1 addrspace(1)* %carryout ret void } + +; FUNC-LABEL: {{^}}v_saddo_v2i32: +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_add_{{[iu]}}32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_add_{{[iu]}}32 +define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 + %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 + %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind + %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0 + %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1 + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + %carry.ext = zext <2 x i1> %carry to <2 x i32> + store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll +++ llvm/trunk/test/CodeGen/AMDGPU/ssubo.ll @@ -1,10 +1,11 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,VI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs< %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false 
-march=r600 -mcpu=cypress -verify-machineinstrs< %s + declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone +declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone ; FUNC-LABEL: {{^}}ssubo_i64_zext: define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { @@ -70,3 +71,22 @@ store i1 %carry, i1 addrspace(1)* %carryout ret void } + +; FUNC-LABEL: {{^}}v_ssubo_v2i32: +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_sub_{{[iu]}}32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_sub_{{[iu]}}32 +define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 + %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 + %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind + %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0 + %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1 + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + %carry.ext = zext <2 x i1> %carry to <2 x i32> + store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uaddo.ll @@ -1,7 +1,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_uaddo_i64_zext: ; GCN: s_add_u32 @@ -152,10 +151,32 @@ ret void } +; FUNC-LABEL: {{^}}v_uaddo_v2i32: +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_add_{{[iu]}}32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_cmp_lt_i32 +; SICIVI: v_add_{{[iu]}}32 +define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 + %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 + %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind + %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0 + %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1 + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + %carry.ext = zext <2 x i1> %carry to <2 x i32> + store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1 +declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + attributes #0 = { nounwind } 
attributes #1 = { nounwind readnone } Index: llvm/trunk/test/CodeGen/AMDGPU/usubo.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/usubo.ll +++ llvm/trunk/test/CodeGen/AMDGPU/usubo.ll @@ -1,7 +1,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s + ; FUNC-LABEL: {{^}}s_usubo_i64_zext: ; GCN: s_sub_u32 @@ -159,10 +159,28 @@ ret void } +; FUNC-LABEL: {{^}}v_usubo_v2i32: +; SICIVI: v_sub_{{[iu]}}32 +; SICIVI: v_cndmask_b32 +; SICIVI: v_sub_{{[iu]}}32 +; SICIVI: v_cndmask_b32 +define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 + %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 + %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind + %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0 + %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1 + store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 + %carry.ext = zext <2 x i1> %carry to <2 x i32> + store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1 +declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: llvm/trunk/test/CodeGen/X86/vec_saddo.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_saddo.ll +++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll @@ -0,0 +1,2028 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x 
i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: saddo_v1i32: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: addl %esi, %edi +; SSE-NEXT: seto %al +; SSE-NEXT: negl %eax +; SSE-NEXT: movl %edi, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: saddo_v1i32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: addl %esi, %edi +; AVX-NEXT: seto %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movl %edi, (%rdx) +; AVX-NEXT: retq + %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) + %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 + %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 + %res = sext <1 x i1> %obit to <1 x i32> + store <1 x i32> %val, <1 x i32>* %p2 + ret <1 x i32> %res +} + +define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: saddo_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: paddq %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: pcmpeqd 
%xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1 +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) + %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i32> %val, <2 x i32>* %p2 + ret <2 x i32> %res +} + +define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; 
SSE2-LABEL: saddo_v3i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v3i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v3i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v3i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v3i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v3i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, 
%k0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) + %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 + %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 + %res = sext <3 x i1> %obit to <3 x i32> + store <3 x i32> %val, <3 x i32>* %p2 + ret <3 x i32> %res +} + +define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE-LABEL: saddo_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: saddo_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) + %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i32> %val, <4 x i32>* %p2 + ret <4 x i32> %res +} + +define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: saddo_v6i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: 
movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movq %xmm2, 16(%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v6i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: 
pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: paddd %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm4 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: paddd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm0, (%rcx) +; SSSE3-NEXT: movq %xmm2, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v6i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: movd %esi, %xmm4 +; SSE41-NEXT: pinsrd $1, %edx, %xmm4 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm4 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm4 +; SSE41-NEXT: movd %r9d, %xmm2 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT: paddd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE41-NEXT: pandn %xmm6, %xmm4 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT: paddd %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT: pandn %xmm6, %xmm3 +; SSE41-NEXT: movq %xmm0, 16(%rcx) +; SSE41-NEXT: movdqa %xmm1, (%rcx) +; SSE41-NEXT: movq %xmm3, 16(%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v6i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; 
AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vmovq %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v6i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovq %xmm2, 16(%rdi) +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v6i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovq %xmm2, 16(%rdi) +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) + %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 + %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 + %res = sext <6 x i1> %obit to <6 x i32> + store <6 x i32> %val, <6 x i32>* %p2 + ret <6 x i32> %res +} + +define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE-LABEL: saddo_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE-NEXT: pxor %xmm5, %xmm7 +; SSE-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm5, %xmm2 +; SSE-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE-NEXT: pxor %xmm5, %xmm7 +; SSE-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE-NEXT: paddd %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm4, 16(%rdi) +; SSE-NEXT: 
movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: saddo_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm9 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm4, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) + %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i32> %val, <8 x i32>* %p2 + ret <8 x i32> %res +} + +define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE-LABEL: saddo_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE-NEXT: pcmpeqd %xmm10, %xmm10 +; SSE-NEXT: pxor %xmm10, %xmm11 +; SSE-NEXT: pxor 
%xmm12, %xmm12 +; SSE-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE-NEXT: pxor %xmm10, %xmm12 +; SSE-NEXT: pcmpeqd %xmm12, %xmm11 +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE-NEXT: pxor %xmm10, %xmm9 +; SSE-NEXT: pcmpeqd %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: pxor %xmm12, %xmm12 +; SSE-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE-NEXT: pxor %xmm10, %xmm12 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm10, %xmm4 +; SSE-NEXT: pcmpeqd %xmm4, %xmm12 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE-NEXT: pxor %xmm10, %xmm11 +; SSE-NEXT: pcmpeqd %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm10, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE-NEXT: pxor %xmm10, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm10, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE-NEXT: pxor %xmm10, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE-NEXT: pxor %xmm10, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE-NEXT: paddd %xmm7, %xmm8 +; SSE-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE-NEXT: pxor %xmm10, %xmm3 +; SSE-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm8, 48(%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: retq +; +; AVX1-LABEL: saddo_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm10 +; AVX1-NEXT: vpcmpeqd %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm11 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm11, %xmm6, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX1-NEXT: vpaddd %xmm9, %xmm7, %xmm9 +; AVX1-NEXT: vpcmpgtd %xmm9, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm10, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm10 +; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm11 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7 +; 
AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm12 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm4 +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmovsxwd %xmm8, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm3, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7 +; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpandn %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 +; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0 +; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vpandn %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) + %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i32> %val, <16 x i32>* %p2 + ret <16 x i32> %res +} + +define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: saddo_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: 
pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqb %xmm5, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqb %xmm5, %xmm2 +; SSSE3-NEXT: paddb %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtb 
%xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm2 +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa %xmm6, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm6 +; AVX2-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 +; 
AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %xmm6, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) + %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i8> %val, <16 x i8>* %p2 + ret <16 x i32> %res +} + +define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: saddo_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqw %xmm5, %xmm3 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqw %xmm5, %xmm3 +; SSSE3-NEXT: paddw %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 
+; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpgtw %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtw %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqw %xmm5, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE41-NEXT: pandn %xmm3, %xmm1 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i16> %val, <8 x i16>* %p2 + ret <8 x i32> 
%res +} + +define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE2-LABEL: saddo_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v2i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v2i64: +; 
SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 +; SSE41-NEXT: pandn %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) + %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i64> %val, <2 x i64>* %p2 + ret <2 x 
i32> %res +} + +define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: saddo_v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pslld $8, %xmm2 +; SSE2-NEXT: psrad $8, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pslld $8, %xmm0 +; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movw %cx, 9(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movw %dx, 6(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movw %si, 3(%rdi) +; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 11(%rdi) +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 8(%rdi) +; SSE2-NEXT: shrl $16, %esi +; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm2 +; SSSE3-NEXT: psrad $8, %xmm2 +; SSSE3-NEXT: paddd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movw %cx, 9(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movw %dx, 6(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movw %si, 3(%rdi) +; SSSE3-NEXT: shrl $16, %eax +; SSSE3-NEXT: movb %al, 2(%rdi) +; SSSE3-NEXT: shrl $16, %ecx +; SSSE3-NEXT: movb %cl, 11(%rdi) +; SSSE3-NEXT: shrl $16, %edx +; SSSE3-NEXT: movb %dl, 8(%rdi) +; SSSE3-NEXT: shrl $16, %esi +; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: psrad $8, %xmm1 +; SSE41-NEXT: pslld $8, %xmm2 +; SSE41-NEXT: psrad $8, %xmm2 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: psrad $8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm2, %eax +; SSE41-NEXT: movw %ax, 9(%rdi) +; SSE41-NEXT: pextrd $2, %xmm2, %ecx +; SSE41-NEXT: movw %cx, 6(%rdi) +; SSE41-NEXT: pextrd $1, %xmm2, %edx +; SSE41-NEXT: movw %dx, 3(%rdi) +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: movw %si, (%rdi) +; SSE41-NEXT: shrl $16, %eax +; SSE41-NEXT: movb %al, 11(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 8(%rdi) +; SSE41-NEXT: shrl $16, %edx +; SSE41-NEXT: movb %dl, 5(%rdi) +; SSE41-NEXT: shrl $16, %esi +; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v4i24: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX1-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd 
%xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: movw %ax, 9(%rdi) +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 6(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm1, %edx +; AVX1-NEXT: movw %dx, 3(%rdi) +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: movw %si, (%rdi) +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: movb %al, 11(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 8(%rdi) +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: movb %dl, 5(%rdi) +; AVX1-NEXT: shrl $16, %esi +; AVX1-NEXT: movb %sil, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v4i24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: movw %ax, 9(%rdi) +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 6(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %edx +; AVX2-NEXT: movw %dx, 3(%rdi) +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movw %si, (%rdi) +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: movb %al, 11(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 8(%rdi) +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: movb %dl, 5(%rdi) +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: movb %sil, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v4i24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX512-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: movw %ax, 9(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %cx, 6(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %edx +; AVX512-NEXT: movw %dx, 3(%rdi) +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: movw %si, (%rdi) +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: movb %al, 11(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 8(%rdi) +; AVX512-NEXT: shrl $16, %edx +; AVX512-NEXT: movb %dl, 5(%rdi) +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: movb %sil, 2(%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) + %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i24> %val, <4 x i24>* %p2 + ret <4 x i32> %res +} + +define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: saddo_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movmskps %xmm1, %eax 
+; SSE-NEXT: movb %al, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: saddo_v4i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v4i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: movb %al, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v4i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k3 +; AVX512-NEXT: kxorw %k2, %k0, %k0 +; AVX512-NEXT: kxorw %k0, %k1, %k1 +; AVX512-NEXT: kandnw %k3, %k1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) + %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i1> %val, <4 x i1>* %p2 + ret <4 x i32> %res +} + +define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: saddo_v2i128: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: adcq %r11, %rax +; SSE2-NEXT: setns %bl +; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: setns %bl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: sete %cl +; SSE2-NEXT: andb %bpl, %cl +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: setns %bl +; SSE2-NEXT: testq %rsi, %rsi +; SSE2-NEXT: setns %cl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: sete %r11b +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: adcq %r9, %rsi +; SSE2-NEXT: setns %bl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %cl +; SSE2-NEXT: andb %r11b, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $4, %ebp, %xmm0 +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rax, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: saddo_v2i128: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), 
%r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: adcq %r11, %rax +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: testq %rcx, %rcx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: setne %bpl +; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: sete %cl +; SSSE3-NEXT: andb %bpl, %cl +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: testq %rsi, %rsi +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: sete %r11b +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: adcq %r9, %rsi +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: setne %cl +; SSSE3-NEXT: andb %r11b, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0 +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rax, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: saddo_v2i128: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: adcq %r11, %rax +; SSE41-NEXT: setns %bl +; SSE41-NEXT: testq %rcx, %rcx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %bpl +; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: setns %bl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: sete %cl +; SSE41-NEXT: andb %bpl, %cl +; SSE41-NEXT: movzbl %cl, %ebp +; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: setns %bl +; SSE41-NEXT: testq %rsi, %rsi +; SSE41-NEXT: setns %cl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: sete %r11b +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: adcq %r9, %rsi +; SSE41-NEXT: setns %bl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %cl +; SSE41-NEXT: andb %r11b, %cl +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r10) +; SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rax, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: saddo_v2i128: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: adcq %r11, %rax +; AVX1-NEXT: setns %bl +; AVX1-NEXT: testq %rcx, %rcx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %bpl +; AVX1-NEXT: testq %r11, %r11 +; AVX1-NEXT: setns %bl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: sete %cl +; AVX1-NEXT: andb %bpl, %cl +; AVX1-NEXT: movzbl %cl, %ebp +; AVX1-NEXT: testq %r9, %r9 +; AVX1-NEXT: setns %bl +; AVX1-NEXT: testq %rsi, %rsi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: sete %r11b +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: adcq %r9, %rsi +; AVX1-NEXT: setns %bl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %cl +; AVX1-NEXT: andb %r11b, %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vmovd 
%ecx, %xmm0 +; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r10) +; AVX1-NEXT: movq %rdi, (%r10) +; AVX1-NEXT: movq %rax, 24(%r10) +; AVX1-NEXT: movq %rsi, 8(%r10) +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: saddo_v2i128: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: adcq %r11, %rax +; AVX2-NEXT: setns %bl +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %bpl +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: setns %bl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: sete %cl +; AVX2-NEXT: andb %bpl, %cl +; AVX2-NEXT: movzbl %cl, %ebp +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: setns %bl +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: sete %r11b +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: adcq %r9, %rsi +; AVX2-NEXT: setns %bl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %cl +; AVX2-NEXT: andb %r11b, %cl +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r10) +; AVX2-NEXT: movq %rdi, (%r10) +; AVX2-NEXT: movq %rax, 24(%r10) +; AVX2-NEXT: movq %rsi, 8(%r10) +; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: saddo_v2i128: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: adcq %r11, %r14 +; AVX512-NEXT: setns %bl +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %bl, %cl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: testq %r11, %r11 +; AVX512-NEXT: setns %al +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: sete %al +; AVX512-NEXT: andb %bl, %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: setns %al +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: adcq %r9, %rsi +; AVX512-NEXT: setns %bl +; AVX512-NEXT: cmpb %bl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: andb %al, %cl +; AVX512-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %r14, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: retq + %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) + %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i128> %val, <2 x i128>* %p2 + ret <2 x i32> %res +} Index: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll =================================================================== --- 
llvm/trunk/test/CodeGen/X86/vec_ssubo.ll +++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll @@ -0,0 +1,2078 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @ssubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v1i32: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: subl %esi, %edi +; SSE-NEXT: seto %al +; SSE-NEXT: negl %eax +; SSE-NEXT: movl %edi, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: ssubo_v1i32: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: subl %esi, %edi +; AVX-NEXT: seto %al +; AVX-NEXT: negl %eax +; AVX-NEXT: movl %edi, (%rdx) +; AVX-NEXT: retq + %t = call {<1 x i32>, <1 x i1>} @llvm.ssub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) + %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 + %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 + %res = sext <1 x i1> %obit to <1 x i32> + store <1 x i32> %val, <1 x i32>* %p2 + ret <1 x i32> %res +} + +define <2 x i32> @ssubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: ssubo_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: psubq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllq $32, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: psubq %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psllq $32, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: psubq %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsllq 
$32, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsllq $32, %xmm0, %xmm1 +; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 +; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + %t = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) + %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i32> %val, <2 x i32>* %p2 + ret <2 x i32> %res +} + +define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; SSE2-LABEL: ssubo_v3i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v3i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v3i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm3, %xmm2 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v3i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: 
vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v3i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v3i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) + %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 + %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 + %res = sext <3 x i1> %obit to <3 x i32> + store <3 x i32> %val, <3 x i32>* %p2 + ret <3 x i32> %res +} + +define <4 x i32> @ssubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: ssubo_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd 
%xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltd %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) + %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i32> %val, <4 x i32>* %p2 + ret <4 x i32> %res +} + +define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: ssubo_v6i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: psubd %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: 
pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movq %xmm6, 16(%rdi) +; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v6i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT: psubd %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pandn %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: psubd %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm0, (%rcx) +; SSSE3-NEXT: movq %xmm6, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v6i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd %r9d, %xmm1 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm6 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, 
%xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE41-NEXT: psubd %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pandn %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm6, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: pandn %xmm4, %xmm6 +; SSE41-NEXT: movq %xmm1, 16(%rcx) +; SSE41-NEXT: movdqa %xmm0, (%rcx) +; SSE41-NEXT: movq %xmm6, 16(%rdi) +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v6i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm8 +; AVX1-NEXT: vpsubd %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovq %xmm6, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v6i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovq %xmm2, 16(%rdi) +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v6i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 +; 
AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovq %xmm2, 16(%rdi) +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<6 x i32>, <6 x i1>} @llvm.ssub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) + %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 + %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 + %res = sext <6 x i1> %obit to <6 x i32> + store <6 x i32> %val, <6 x i32>* %p2 + ret <6 x i32> %res +} + +define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE-NEXT: pxor %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE-NEXT: pxor %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE-NEXT: pxor %xmm6, %xmm7 +; SSE-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm5 +; SSE-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: ssubo_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm9, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm8 +; AVX1-NEXT: vpsubd %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k0 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpnltd %ymm2, %ymm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) + %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i32> %val, <8 x i32>* %p2 + ret <8 x i32> %res +} + +define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE-LABEL: ssubo_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE-NEXT: pcmpeqd %xmm11, %xmm11 +; SSE-NEXT: pxor %xmm11, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE-NEXT: pxor %xmm11, %xmm9 +; SSE-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE-NEXT: psubd %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm11, %xmm4 +; SSE-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE-NEXT: pxor %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pcmpgtd %xmm5, %xmm9 +; SSE-NEXT: pxor %xmm11, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm11, %xmm4 +; SSE-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE-NEXT: psubd %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE-NEXT: pxor %xmm11, %xmm5 +; SSE-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm11, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE-NEXT: pxor %xmm11, %xmm5 +; SSE-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm11, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pxor %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE-NEXT: pxor %xmm11, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE-NEXT: pxor %xmm11, %xmm6 +; SSE-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE-NEXT: psubd %xmm7, %xmm3 +; SSE-NEXT: pcmpgtd %xmm3, 
%xmm10 +; SSE-NEXT: pxor %xmm11, %xmm10 +; SSE-NEXT: pcmpeqd %xmm6, %xmm10 +; SSE-NEXT: pxor %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm3, 48(%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: ssubo_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm8, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vpcmpgtd %xmm12, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm10 +; AVX1-NEXT: vpcmpeqd %xmm9, %xmm10, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm9 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm11 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm11, %xmm7, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 +; AVX1-NEXT: vpsubd %xmm8, %xmm12, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm8, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm10, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm10 +; AVX1-NEXT: vpcmpgtd %xmm10, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm13 +; AVX1-NEXT: vpcmpgtd %xmm13, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm11 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm12 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm12, %xmm7, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm11 +; AVX1-NEXT: vpsubd %xmm13, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm4 +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmovsxwd %xmm9, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT: 
vmovaps %ymm3, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm4, %ymm7 +; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm4, %ymm7 +; AVX2-NEXT: vpxor %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm4, %ymm0 +; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k0 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpnltd %zmm2, %zmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) + %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i32> %val, <16 x i32>* %p2 + ret <16 x i32> %res +} + +define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: ssubo_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqb %xmm5, %xmm3 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpeqb %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqb %xmm5, %xmm3 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm3 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa %xmm6, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm6 +; AVX2-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; 
AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %xmm6, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltb %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) + %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i8> %val, <16 x i8>* %p2 + ret <16 x i32> %res +} + +define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: ssubo_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpeqw %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 +; SSSE3-NEXT: psubw %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pcmpeqw %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtw %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: pcmpgtw %xmm0, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE41-NEXT: psubw %xmm2, %xmm0 +; SSE41-NEXT: pcmpgtw %xmm0, %xmm3 +; 
SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqw %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm3, %xmm1 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnltw %xmm2, %xmm1, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i16> %val, <8 x i16>* %p2 + ret <8 x i32> %res +} + +define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE2-LABEL: ssubo_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: pxor 
%xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v2i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psubq %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v2i64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psubq %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; 
SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: pandn %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX2-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm1, %k0 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpnltq %xmm2, %xmm0, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kandw %k1, %k0, %k1 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) + %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i64> %val, <2 x i64>* %p2 + ret <2 x i32> %res +} + +define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: ssubo_v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslld $8, %xmm1 +; SSE2-NEXT: psrad $8, %xmm1 +; SSE2-NEXT: pslld $8, %xmm2 +; SSE2-NEXT: psrad $8, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pslld $8, %xmm0 
+; SSE2-NEXT: psrad $8, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movw %cx, 9(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movw %dx, 6(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movw %si, 3(%rdi) +; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 11(%rdi) +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 8(%rdi) +; SSE2-NEXT: shrl $16, %esi +; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pslld $8, %xmm1 +; SSSE3-NEXT: psrad $8, %xmm1 +; SSSE3-NEXT: pslld $8, %xmm2 +; SSSE3-NEXT: psrad $8, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pslld $8, %xmm0 +; SSSE3-NEXT: psrad $8, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movw %cx, 9(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movw %dx, 6(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movw %si, 3(%rdi) +; SSSE3-NEXT: shrl $16, %eax +; SSSE3-NEXT: movb %al, 2(%rdi) +; SSSE3-NEXT: shrl $16, %ecx +; SSSE3-NEXT: movb %cl, 11(%rdi) +; SSSE3-NEXT: shrl $16, %edx +; SSSE3-NEXT: movb %dl, 8(%rdi) +; SSSE3-NEXT: shrl $16, %esi +; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pslld $8, %xmm1 +; SSE41-NEXT: psrad $8, %xmm1 +; SSE41-NEXT: pslld $8, %xmm2 +; SSE41-NEXT: psrad $8, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pslld $8, %xmm0 +; SSE41-NEXT: psrad $8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrd $3, %xmm2, %eax +; SSE41-NEXT: movw %ax, 9(%rdi) +; SSE41-NEXT: pextrd $2, %xmm2, %ecx +; SSE41-NEXT: movw %cx, 6(%rdi) +; SSE41-NEXT: pextrd $1, %xmm2, %edx +; SSE41-NEXT: movw %dx, 3(%rdi) +; SSE41-NEXT: movd %xmm2, %esi +; SSE41-NEXT: movw %si, (%rdi) +; SSE41-NEXT: shrl $16, %eax +; SSE41-NEXT: movb %al, 11(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 8(%rdi) +; SSE41-NEXT: shrl $16, %edx +; SSE41-NEXT: movb %dl, 5(%rdi) +; SSE41-NEXT: shrl $16, %esi +; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v4i24: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX1-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: movw %ax, 9(%rdi) +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 6(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm1, %edx 
+; AVX1-NEXT: movw %dx, 3(%rdi) +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: movw %si, (%rdi) +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: movb %al, 11(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 8(%rdi) +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: movb %dl, 5(%rdi) +; AVX1-NEXT: shrl $16, %esi +; AVX1-NEXT: movb %sil, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v4i24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: movw %ax, 9(%rdi) +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 6(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %edx +; AVX2-NEXT: movw %dx, 3(%rdi) +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movw %si, (%rdi) +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: movb %al, 11(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 8(%rdi) +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: movb %dl, 5(%rdi) +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: movb %sil, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v4i24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $8, %xmm1, %xmm1 +; AVX512-NEXT: vpsrad $8, %xmm1, %xmm1 +; AVX512-NEXT: vpslld $8, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpslld $8, %xmm1, %xmm0 +; AVX512-NEXT: vpsrad $8, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: movw %ax, 9(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %cx, 6(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %edx +; AVX512-NEXT: movw %dx, 3(%rdi) +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: movw %si, (%rdi) +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: movb %al, 11(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 8(%rdi) +; AVX512-NEXT: shrl $16, %edx +; AVX512-NEXT: movb %dl, 5(%rdi) +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: movb %sil, 2(%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i24>, <4 x i1>} @llvm.ssub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) + %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i24> %val, <4 x i24>* %p2 + ret <4 x i32> %res +} + +define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: ssubo_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: movb %al, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: ssubo_v4i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd 
%xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: ssubo_v4i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: movb %al, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v4i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k2 +; AVX512-NEXT: kxorw %k2, %k1, %k3 +; AVX512-NEXT: kxorw %k2, %k0, %k0 +; AVX512-NEXT: kxnorw %k0, %k1, %k1 +; AVX512-NEXT: kandnw %k1, %k3, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i1>, <4 x i1>} @llvm.ssub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) + %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i1> %val, <4 x i1>* %p2 + ret <4 x i32> %res +} + +define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: ssubo_v2i128: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: sbbq %r11, %rax +; SSE2-NEXT: setns %bl +; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: setns %cl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: setns %bl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %cl +; SSE2-NEXT: andb %bpl, %cl +; SSE2-NEXT: movzbl %cl, %ebp +; SSE2-NEXT: testq %r9, %r9 +; SSE2-NEXT: setns %bl +; SSE2-NEXT: testq %rsi, %rsi +; SSE2-NEXT: setns %cl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %r11b +; SSE2-NEXT: subq %r8, %rdi +; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: setns %bl +; SSE2-NEXT: cmpb %bl, %cl +; SSE2-NEXT: setne %cl +; SSE2-NEXT: andb %r11b, %cl +; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $4, %ebp, %xmm0 +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rax, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: ssubo_v2i128: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: sbbq %r11, %rax +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: testq %rcx, %rcx +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: setne %bpl +; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: 
setne %cl +; SSSE3-NEXT: andb %bpl, %cl +; SSSE3-NEXT: movzbl %cl, %ebp +; SSSE3-NEXT: testq %r9, %r9 +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: testq %rsi, %rsi +; SSSE3-NEXT: setns %cl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: setne %r11b +; SSSE3-NEXT: subq %r8, %rdi +; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: setns %bl +; SSSE3-NEXT: cmpb %bl, %cl +; SSSE3-NEXT: setne %cl +; SSSE3-NEXT: andb %r11b, %cl +; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0 +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rax, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: ssubo_v2i128: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: sbbq %r11, %rax +; SSE41-NEXT: setns %bl +; SSE41-NEXT: testq %rcx, %rcx +; SSE41-NEXT: setns %cl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %bpl +; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: setns %bl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %cl +; SSE41-NEXT: andb %bpl, %cl +; SSE41-NEXT: movzbl %cl, %ebp +; SSE41-NEXT: testq %r9, %r9 +; SSE41-NEXT: setns %bl +; SSE41-NEXT: testq %rsi, %rsi +; SSE41-NEXT: setns %cl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %r11b +; SSE41-NEXT: subq %r8, %rdi +; SSE41-NEXT: sbbq %r9, %rsi +; SSE41-NEXT: setns %bl +; SSE41-NEXT: cmpb %bl, %cl +; SSE41-NEXT: setne %cl +; SSE41-NEXT: andb %r11b, %cl +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrb $8, %ebp, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r10) +; SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rax, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: ssubo_v2i128: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: sbbq %r11, %rax +; AVX1-NEXT: setns %bl +; AVX1-NEXT: testq %rcx, %rcx +; AVX1-NEXT: setns %cl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %bpl +; AVX1-NEXT: testq %r11, %r11 +; AVX1-NEXT: setns %bl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %cl +; AVX1-NEXT: andb %bpl, %cl +; AVX1-NEXT: movzbl %cl, %ebp +; AVX1-NEXT: testq %r9, %r9 +; AVX1-NEXT: setns %bl +; AVX1-NEXT: testq %rsi, %rsi +; AVX1-NEXT: setns %cl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %r11b +; AVX1-NEXT: subq %r8, %rdi +; AVX1-NEXT: sbbq %r9, %rsi +; AVX1-NEXT: setns %bl +; AVX1-NEXT: cmpb %bl, %cl +; AVX1-NEXT: setne %cl +; AVX1-NEXT: andb %r11b, %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r10) +; AVX1-NEXT: movq %rdi, (%r10) +; AVX1-NEXT: movq %rax, 24(%r10) +; AVX1-NEXT: movq %rsi, 8(%r10) +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: ssubo_v2i128: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: sbbq %r11, %rax +; AVX2-NEXT: setns %bl +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: setns %cl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %bpl +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: setns %bl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %cl +; AVX2-NEXT: andb %bpl, %cl +; AVX2-NEXT: movzbl %cl, %ebp +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: setns %bl +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: setns %cl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %r11b +; AVX2-NEXT: subq %r8, %rdi +; AVX2-NEXT: sbbq %r9, %rsi +; AVX2-NEXT: setns %bl +; AVX2-NEXT: cmpb %bl, %cl +; AVX2-NEXT: setne %cl +; AVX2-NEXT: andb %r11b, %cl +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r10) +; AVX2-NEXT: movq %rdi, (%r10) +; AVX2-NEXT: movq %rax, 24(%r10) +; AVX2-NEXT: movq %rsi, 8(%r10) +; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ssubo_v2i128: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: sbbq %r11, %r14 +; AVX512-NEXT: setns %bl +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %bl, %cl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: testq %r11, %r11 +; AVX512-NEXT: setns %al +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: andb %bl, %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: testq %r9, %r9 +; AVX512-NEXT: setns %al +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: setns %cl +; AVX512-NEXT: cmpb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: subq %r8, %rdi +; AVX512-NEXT: sbbq %r9, %rsi +; AVX512-NEXT: setns %bl +; AVX512-NEXT: cmpb %bl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: andb %al, %cl +; AVX512-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %r14, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: retq + %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) + %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i128> %val, <2 x i128>* %p2 + ret <2 x i32> %res +} Index: llvm/trunk/test/CodeGen/X86/vec_uaddo.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_uaddo.ll +++ llvm/trunk/test/CodeGen/X86/vec_uaddo.ll @@ -0,0 +1,1381 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @uaddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: uaddo_v1i32: +; SSE: # %bb.0: +; SSE-NEXT: addl %esi, %edi +; SSE-NEXT: sbbl %eax, %eax +; SSE-NEXT: movl %edi, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: uaddo_v1i32: +; AVX: # %bb.0: +; AVX-NEXT: addl %esi, %edi +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: movl %edi, (%rdx) +; AVX-NEXT: retq + %t = call {<1 x i32>, <1 x i1>} @llvm.uadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) + %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 + %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 + %res = sext <1 x i1> %obit to <1 x i32> + store <1 x i32> %val, <1 x i32>* %p2 + ret <1 x i32> %res +} + +define <2 x i32> @uaddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + %t = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) + %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i32> %val, <2 x i32>* %p2 + ret <2 x i32> %res +} + +define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v3i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: movq %xmm1, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, 8(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v3i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: movq %xmm1, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, 8(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v3i32: +; SSE41: # %bb.0: +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, 
%xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) +; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v3i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v3i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v3i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) + %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 + %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 + %res = sext <3 x i1> %obit to <3 x i32> + store <3 x i32> %val, <3 x i32>* %p2 + ret <3 x i32> %res +} + +define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pmaxud %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) + %val = 
extractvalue {<4 x i32>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i32> %val, <4 x i32>* %p2 + ret <4 x i32> %res +} + +define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v6i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movq %xmm3, 16(%rcx) +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movq %xmm2, 16(%rdi) +; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v6i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: movd %r9d, %xmm2 +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, (%rcx) +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: paddd %xmm2, %xmm3 +; 
SSSE3-NEXT: movq %xmm3, 16(%rcx) +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: movq %xmm2, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v6i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd %r9d, %xmm2 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: paddd %xmm0, %xmm3 +; SSE41-NEXT: pmaxud %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: pmaxud %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: movq %xmm1, 16(%rcx) +; SSE41-NEXT: movdqa %xmm3, (%rcx) +; SSE41-NEXT: movq %xmm2, 16(%rdi) +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v6i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vmovq %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v6i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovq %xmm2, 16(%rdi) +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v6i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovq %xmm2, 16(%rdi) +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) + %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 + %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 + %res = sext <6 x i1> %obit to <6 x i32> + store <6 x i32> %val, <6 x i32>* %p2 + ret <6 x i32> %res +} + +define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa 
{{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, 16(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: paddd %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: pmaxud %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm3 +; SSE41-NEXT: pmaxud %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, 16(%rdi) +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) + %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i32> %val, <8 x i32>* %p2 + ret <8 x i32> %res +} + +define <16 x i32> @uaddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE2-LABEL: uaddo_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa 
%xmm4, (%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm5, 16(%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm7, 48(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v16i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: paddd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 +; SSSE3-NEXT: paddd %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, 16(%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: paddd %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: paddd %xmm3, %xmm7 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, 48(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: paddd %xmm0, %xmm4 +; SSE41-NEXT: pmaxud %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm5 +; SSE41-NEXT: pmaxud %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT: pxor %xmm8, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm6 +; SSE41-NEXT: pmaxud %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm7 +; SSE41-NEXT: pmaxud %xmm7, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm7, 48(%rdi) +; SSE41-NEXT: movdqa %xmm6, 32(%rdi) +; SSE41-NEXT: movdqa %xmm5, 16(%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaxud %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmaxud %xmm7, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpmaxud %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm4, %ymm0 +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpmaxud %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpltud %zmm0, %zmm1, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) + %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i32> %val, <16 x i32>* %p2 + ret <16 x i32> %res +} + +define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: uaddo_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: paddb %xmm0, %xmm1 +; SSSE3-NEXT: pmaxub %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm4, 
%xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pmaxub %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: movdqa %xmm1, (%rdi) +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpmaxub %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; 
AVX1-NEXT: vmovdqa %xmm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpmaxub %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) + %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i8> %val, <16 x i8>* %p2 + ret <16 x i32> %res +} + +define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: uaddo_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pmaxuw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 
+; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm1, (%rdi) +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltuw %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i16> %val, <8 x i16>* %p2 + ret <8 x i32> %res +} + +define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE-LABEL: uaddo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: uaddo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) + %val = extractvalue {<2 x i64>, <2 x i1>} 
%t, 0 + %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i64> %val, <2 x i64>* %p2 + ret <2 x i32> %res +} + +define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: uaddo_v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movw %cx, 9(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movw %dx, 6(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movw %si, 3(%rdi) +; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 11(%rdi) +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 8(%rdi) +; SSE2-NEXT: shrl $16, %esi +; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: paddd %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movw %cx, 9(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movw %dx, 6(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movw %si, 3(%rdi) +; SSSE3-NEXT: shrl $16, %eax +; SSSE3-NEXT: movb %al, 2(%rdi) +; SSSE3-NEXT: shrl $16, %ecx +; SSSE3-NEXT: movb %cl, 11(%rdi) +; SSSE3-NEXT: shrl $16, %edx +; SSSE3-NEXT: movb %dl, 8(%rdi) +; SSSE3-NEXT: shrl $16, %esi +; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: movw %ax, 9(%rdi) +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %cx, 6(%rdi) +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: movw %dx, 3(%rdi) +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: movw %si, (%rdi) +; SSE41-NEXT: shrl $16, %eax +; SSE41-NEXT: movb %al, 11(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 8(%rdi) +; SSE41-NEXT: shrl $16, %edx +; SSE41-NEXT: movb %dl, 5(%rdi) +; SSE41-NEXT: shrl $16, %esi +; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v4i24: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: movw %ax, 9(%rdi) +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 6(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm1, %edx +; AVX1-NEXT: movw %dx, 3(%rdi) +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: movw %si, (%rdi) +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: movb %al, 11(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 8(%rdi) +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: movb %dl, 5(%rdi) +; AVX1-NEXT: shrl $16, %esi +; AVX1-NEXT: movb %sil, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v4i24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: movw %ax, 9(%rdi) +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 6(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %edx +; AVX2-NEXT: movw %dx, 3(%rdi) +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movw %si, (%rdi) +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: movb %al, 11(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 8(%rdi) +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: movb %dl, 5(%rdi) +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: movb %sil, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v4i24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: movw %ax, 9(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %cx, 6(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %edx +; AVX512-NEXT: movw %dx, 3(%rdi) +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: movw %si, (%rdi) +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: movb %al, 11(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 8(%rdi) +; AVX512-NEXT: shrl $16, %edx +; AVX512-NEXT: movb %dl, 5(%rdi) +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: movb %sil, 2(%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) + %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i24> %val, <4 x i24>* %p2 + ret <4 x i32> %res +} + +define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: uaddo_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; 
SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: movb %al, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uaddo_v4i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v4i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: movb %al, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v4i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k2 +; AVX512-NEXT: kxnorw %k1, %k0, %k1 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1} +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) + %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i1> %val, <4 x i1>* %p2 + ret <4 x i32> %res +} + +define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: uaddo_v2i128: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: addq %r8, %rdi +; SSE2-NEXT: adcq %r9, %rsi +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rcx, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: uaddo_v2i128: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %r11d +; SSSE3-NEXT: addq %r8, %rdi +; SSSE3-NEXT: adcq %r9, %rsi +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0 +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rcx, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: uaddo_v2i128: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; 
SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: addq %r8, %rdi +; SSE41-NEXT: adcq %r9, %rsi +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrb $8, %r11d, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r10) +; SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rcx, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: uaddo_v2i128: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: setb %al +; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: addq %r8, %rdi +; AVX1-NEXT: adcq %r9, %rsi +; AVX1-NEXT: setb %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r10) +; AVX1-NEXT: movq %rdi, (%r10) +; AVX1-NEXT: movq %rcx, 24(%r10) +; AVX1-NEXT: movq %rsi, 8(%r10) +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: setb %al +; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: addq %r8, %rdi +; AVX2-NEXT: adcq %r9, %rsi +; AVX2-NEXT: setb %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r10) +; AVX2-NEXT: movq %rdi, (%r10) +; AVX2-NEXT: movq %rcx, 24(%r10) +; AVX2-NEXT: movq %rsi, 8(%r10) +; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uaddo_v2i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: adcq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: setb %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: addq %r8, %rdi +; AVX512-NEXT: adcq %r9, %rsi +; AVX512-NEXT: setb %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %rcx, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) + %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i128> %val, <2 x i128>* %p2 + ret <2 x i32> %res +} Index: llvm/trunk/test/CodeGen/X86/vec_usubo.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_usubo.ll +++ llvm/trunk/test/CodeGen/X86/vec_usubo.ll @@ -0,0 +1,1422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s 
--check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 + +declare {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32>, <1 x i32>) +declare {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) +declare {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32>, <3 x i32>) +declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32>, <6 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>) + +declare {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24>, <4 x i24>) +declare {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1>, <4 x i1>) +declare {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128>, <2 x i128>) + +define <1 x i32> @usubo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { +; SSE-LABEL: usubo_v1i32: +; SSE: # %bb.0: +; SSE-NEXT: subl %esi, %edi +; SSE-NEXT: sbbl %eax, %eax +; SSE-NEXT: movl %edi, (%rdx) +; SSE-NEXT: retq +; +; AVX-LABEL: usubo_v1i32: +; AVX: # %bb.0: +; AVX-NEXT: subl %esi, %edi +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: movl %edi, (%rdx) +; AVX-NEXT: retq + %t = call {<1 x i32>, <1 x i1>} @llvm.usub.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) + %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 + %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 + %res = sext <1 x i1> %obit to <1 x i32> + store <1 x i32> %val, <1 x i32>* %p2 + ret <1 x i32> %res +} + +define <2 x i32> @usubo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubq %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: psubq %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512-NEXT: vpmovqd %xmm0, (%rdi) +; AVX512-NEXT: vpcmpeqq %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + %t = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) + %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i32> %val, <2 x i32>* %p2 + ret <2 x i32> %res +} + +define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v3i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v3i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v3i32: +; SSE41: # %bb.0: 
+; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi) +; SSE41-NEXT: movq %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v3i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v3i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v3i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) + %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 + %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 + %res = sext <3 x i1> %obit to <3 x i32> + store <3 x i32> %val, <3 x i32>* %p2 + ret <3 x i32> %res +} + +define <4 x i32> @usubo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: psubd %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; 
AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) + %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i32> %val, <4 x i32>* %p2 + ret <4 x i32> %res +} + +define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v6i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: movq %xmm0, 16(%rcx) +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v6i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd %r8d, %xmm0 +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psubd %xmm3, %xmm0 +; SSSE3-NEXT: movq %xmm0, 16(%rcx) +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 +; SSSE3-NEXT: movq %xmm0, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v6i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: movd %esi, %xmm0 +; SSE41-NEXT: pinsrd $1, %edx, %xmm0 +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 +; SSE41-NEXT: movd %r9d, %xmm2 +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm2 +; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psubd %xmm3, %xmm4 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psubd %xmm1, %xmm5 +; SSE41-NEXT: pminud %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: movq %xmm5, 16(%rcx) +; SSE41-NEXT: movdqa %xmm4, (%rcx) +; SSE41-NEXT: movq %xmm2, 16(%rdi) +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v6i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vmovq %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v6i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovq %xmm2, 16(%rdi) +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v6i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpcmpnleud %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vmovq %xmm2, 16(%rdi) +; 
AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<6 x i32>, <6 x i1>} @llvm.usub.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) + %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 + %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 + %res = sext <6 x i1> %obit to <6 x i32> + store <6 x i32> %val, <6 x i32>* %p2 + ret <6 x i32> %res +} + +define <8 x i32> @usubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: psubd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm1, 16(%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: psubd %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psubd %xmm2, %xmm4 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psubd %xmm3, %xmm5 +; SSE41-NEXT: pminud %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm5, 16(%rdi) +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminud %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpminud %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: 
vpcmpnleud %ymm0, %ymm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) + %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i32> %val, <8 x i32>* %p2 + ret <8 x i32> %res +} + +define <16 x i32> @usubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { +; SSE2-LABEL: usubo_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: psubd %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: psubd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: psubd %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm3, 48(%rdi) +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v16i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: psubd %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: psubd %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: psubd %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v16i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: psubd %xmm4, %xmm8 +; SSE41-NEXT: pminud %xmm8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psubd %xmm5, %xmm4 +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psubd %xmm6, %xmm5 +; SSE41-NEXT: pminud %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm7, %xmm6 +; SSE41-NEXT: pminud %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE41-NEXT: pxor %xmm9, %xmm3 +; SSE41-NEXT: movdqa %xmm6, 48(%rdi) +; SSE41-NEXT: movdqa %xmm5, 32(%rdi) +; SSE41-NEXT: movdqa %xmm4, 16(%rdi) +; SSE41-NEXT: movdqa %xmm8, (%rdi) +; SSE41-NEXT: retq +; +; 
AVX1-LABEL: usubo_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpsubd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpminud %xmm7, %xmm5, %xmm7 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm7 +; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpminud %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vmovaps %ymm3, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpminud %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpcmpnleud %zmm0, %zmm1, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) + %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i32> %val, <16 x i32>* %p2 + ret <16 x i32> %res +} + +define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { +; SSE2-LABEL: usubo_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psubb %xmm1, %xmm4 +; SSE2-NEXT: pminub %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; 
SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: psubb %xmm1, %xmm4 +; SSSE3-NEXT: pminub %xmm4, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psubb %xmm1, %xmm4 +; SSE41-NEXT: pminub %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm3 +; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: movdqa %xmm4, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: 
vpsubb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpminub %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpminub %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnleub %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) + %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 + %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 + %res = sext <16 x i1> %obit to <16 x i32> + store <16 x i8> %val, <16 x i8>* %p2 + ret <16 x i32> %res +} + +define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { +; SSE2-LABEL: usubo_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm1 +; 
SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: psubw %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, (%rdi) +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psubw %xmm1, %xmm2 +; SSE41-NEXT: pminuw %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminuw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnleuw %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: retq + %t = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 + %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 + %res = sext <8 x i1> %obit to <8 x i32> + store <8 x i16> %val, <8 x i16>* %p2 + ret <8 x i32> %res +} + +define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { +; SSE-LABEL: usubo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: 
movdqa %xmm2, %xmm1 +; SSE-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: usubo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpcmpnleuq %xmm0, %xmm1, %k1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) + %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i64> %val, <2 x i64>* %p2 + ret <2 x i32> %res +} + +define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { +; SSE2-LABEL: usubo_v4i24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: movw %ax, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movw %cx, 9(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %edx +; SSE2-NEXT: movw %dx, 6(%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movw %si, 3(%rdi) +; SSE2-NEXT: shrl $16, %eax +; SSE2-NEXT: movb %al, 2(%rdi) +; SSE2-NEXT: shrl $16, %ecx +; SSE2-NEXT: movb %cl, 11(%rdi) +; SSE2-NEXT: shrl $16, %edx +; SSE2-NEXT: movb %dl, 8(%rdi) +; SSE2-NEXT: shrl $16, %esi +; SSE2-NEXT: movb %sil, 5(%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v4i24: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: psubd %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: movw %ax, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %ecx +; SSSE3-NEXT: movw %cx, 9(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: movw %dx, 
6(%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3] +; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: movw %si, 3(%rdi) +; SSSE3-NEXT: shrl $16, %eax +; SSSE3-NEXT: movb %al, 2(%rdi) +; SSSE3-NEXT: shrl $16, %ecx +; SSSE3-NEXT: movb %cl, 11(%rdi) +; SSSE3-NEXT: shrl $16, %edx +; SSSE3-NEXT: movb %dl, 8(%rdi) +; SSSE3-NEXT: shrl $16, %esi +; SSSE3-NEXT: movb %sil, 5(%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v4i24: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: movw %ax, 9(%rdi) +; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %cx, 6(%rdi) +; SSE41-NEXT: pextrd $1, %xmm0, %edx +; SSE41-NEXT: movw %dx, 3(%rdi) +; SSE41-NEXT: movd %xmm0, %esi +; SSE41-NEXT: movw %si, (%rdi) +; SSE41-NEXT: shrl $16, %eax +; SSE41-NEXT: movb %al, 11(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 8(%rdi) +; SSE41-NEXT: shrl $16, %edx +; SSE41-NEXT: movb %dl, 5(%rdi) +; SSE41-NEXT: shrl $16, %esi +; SSE41-NEXT: movb %sil, 2(%rdi) +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v4i24: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.35098856E-38,2.35098856E-38,2.35098856E-38,2.35098856E-38] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: movw %ax, 9(%rdi) +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 6(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm1, %edx +; AVX1-NEXT: movw %dx, 3(%rdi) +; AVX1-NEXT: vmovd %xmm1, %esi +; AVX1-NEXT: movw %si, (%rdi) +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: movb %al, 11(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 8(%rdi) +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: movb %dl, 5(%rdi) +; AVX1-NEXT: shrl $16, %esi +; AVX1-NEXT: movb %sil, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v4i24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: movw %ax, 9(%rdi) +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 6(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %edx +; AVX2-NEXT: movw %dx, 3(%rdi) +; AVX2-NEXT: vmovd %xmm1, %esi +; AVX2-NEXT: movw %si, (%rdi) +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: movb %al, 11(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 8(%rdi) +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: movb %dl, 5(%rdi) +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: movb %sil, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v4i24: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpsubd 
%xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: movw %ax, 9(%rdi) +; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %cx, 6(%rdi) +; AVX512-NEXT: vpextrd $1, %xmm1, %edx +; AVX512-NEXT: movw %dx, 3(%rdi) +; AVX512-NEXT: vmovd %xmm1, %esi +; AVX512-NEXT: movw %si, (%rdi) +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: movb %al, 11(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 8(%rdi) +; AVX512-NEXT: shrl $16, %edx +; AVX512-NEXT: movb %dl, 5(%rdi) +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: movb %sil, 2(%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i24>, <4 x i1>} @llvm.usub.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) + %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i24> %val, <4 x i24>* %p2 + ret <4 x i32> %res +} + +define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { +; SSE-LABEL: usubo_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: movb %al, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: usubo_v4i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v4i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: movb %al, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v4i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1} +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z} +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: retq + %t = call {<4 x i1>, <4 x i1>} @llvm.usub.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) + %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 + %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 + %res = sext <4 x i1> %obit to <4 x i32> + store <4 x i1> %val, <4 x i1>* %p2 + ret <4 x i32> %res +} + +define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { +; SSE2-LABEL: usubo_v2i128: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), 
%r10 +; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: subq %r8, %rdi +; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %r11d, %xmm0 +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rcx, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: usubo_v2i128: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %r11d +; SSSE3-NEXT: subq %r8, %rdi +; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pinsrw $4, %r11d, %xmm0 +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rcx, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: usubo_v2i128: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %r11d +; SSE41-NEXT: subq %r8, %rdi +; SSE41-NEXT: sbbq %r9, %rsi +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pinsrb $8, %r11d, %xmm0 +; SSE41-NEXT: movq %rdx, 16(%r10) +; SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rcx, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: usubo_v2i128: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: setb %al +; AVX1-NEXT: movzbl %al, %r11d +; AVX1-NEXT: subq %r8, %rdi +; AVX1-NEXT: sbbq %r9, %rsi +; AVX1-NEXT: setb %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, 16(%r10) +; AVX1-NEXT: movq %rdi, (%r10) +; AVX1-NEXT: movq %rcx, 24(%r10) +; AVX1-NEXT: movq %rsi, 8(%r10) +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: setb %al +; AVX2-NEXT: movzbl %al, %r11d +; AVX2-NEXT: subq %r8, %rdi +; AVX2-NEXT: sbbq %r9, %rsi +; AVX2-NEXT: setb %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, 16(%r10) +; AVX2-NEXT: movq %rdi, (%r10) +; AVX2-NEXT: movq %rcx, 24(%r10) +; AVX2-NEXT: movq %rsi, 8(%r10) +; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: usubo_v2i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: sbbq 
{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: setb %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: subq %r8, %rdi +; AVX512-NEXT: sbbq %r9, %rsi +; AVX512-NEXT: setb %al +; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: kmovw -{{[0-9]+}}(%rsp), %k1 +; AVX512-NEXT: movq %rdx, 16(%r10) +; AVX512-NEXT: movq %rdi, (%r10) +; AVX512-NEXT: movq %rcx, 24(%r10) +; AVX512-NEXT: movq %rsi, 8(%r10) +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: retq + %t = call {<2 x i128>, <2 x i1>} @llvm.usub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) + %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 + %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 + %res = sext <2 x i1> %obit to <2 x i32> + store <2 x i128> %val, <2 x i128>* %p2 + ret <2 x i32> %res +}
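For readers less familiar with the overflow intrinsics these tests exercise, the same structured-return pattern at scalar type is sketched below. This is an illustrative sample only; the function name usubo_scalar is hypothetical and is not one of the autogenerated tests in this patch.

; Hypothetical scalar analogue of the vector tests above (illustration only).
declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)

define i32 @usubo_scalar(i32 %a0, i32 %a1, i32* %p2) nounwind {
  ; The intrinsic returns both the subtraction result and an i1 overflow flag.
  %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a0, i32 %a1)
  %val = extractvalue {i32, i1} %t, 0
  %obit = extractvalue {i32, i1} %t, 1
  ; Store the arithmetic result and return the sign-extended overflow flag,
  ; mirroring the store/sext pattern used by the vector tests.
  store i32 %val, i32* %p2
  %res = sext i1 %obit to i32
  ret i32 %res
}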